In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the labelled stock-message dataset (path is relative to the working directory).
# A forward slash is used because "\s" in a normal string literal is an invalid
# escape sequence, and "/" is accepted as a separator on Windows as well as POSIX.
df = pd.read_csv("Desktop/stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [4]:
# Token normalisers: `lemma` is the one used by preprocess(); `stem` is kept
# available as an alternative.
lemma = WordNetLemmatizer()
stem = SnowballStemmer(language="english")

In [5]:
# English stopword list from the NLTK corpus (converted to a set in the next cell).
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded
# beforehand — confirm for fresh environments.
stopwords_english = stopwords.words("english")

In [6]:
# De-duplicate into a set so the membership tests in preprocess() are hash lookups.
stopwords_english = {word for word in stopwords_english}

In [7]:
# Keep "not" as a real token: negation carries sentiment.
# discard() (unlike remove()) does not raise KeyError if this cell is re-run.
stopwords_english.discard("not")

In [8]:
# Matches tokens that START with word characters, "!" or "?" (used with .match()).
# Despite the name this also accepts digits and underscores, since \w covers them.
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
IS_ALPHA = re.compile(r"[\w!?]+")


def preprocess(text):
    """Normalise one raw message into a space-joined string of lemmatised tokens.

    Steps: lowercase, expand "&" and "n't", tokenise with ``wordpunct_tokenize``,
    keep tokens accepted by ``IS_ALPHA``, drop stopwords, lemmatise.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        Cleaned text with tokens separated by single spaces ("" if nothing is left).
    """
    # Lowercase first so upper-case contractions such as "DON'T" are expanded too.
    text = text.lower()
    text = text.replace("&", " and ")
    text = text.replace("n't", " not ")

    kept_tokens = []
    for token in wordpunct_tokenize(text):
        if not IS_ALPHA.match(token):
            continue
        # Filter on the surface form BEFORE lemmatising: the default (noun)
        # lemmatiser can mangle stopwords (e.g. "was" -> "wa"), which would
        # otherwise slip past the stopword check.
        if token in stopwords_english:
            continue
        lemmatised = lemma.lemmatize(token)
        # Also drop lemmas that are themselves stopwords (original behaviour).
        if lemmatised in stopwords_english:
            continue
        kept_tokens.append(lemmatised)

    return " ".join(kept_tokens).strip()

In [9]:
# Add the cleaned text as a new column. assign() returns a new frame, so this avoids
# the SettingWithCopyWarning that column assignment on the split frames can trigger,
# and the cell stays safe to re-run.
train_df = train_df.assign(Text_proceed=train_df["Text"].apply(preprocess))
test_df = test_df.assign(Text_proceed=test_df["Text"].apply(preprocess))

In [10]:
# Input column passed to the models and the label column to predict.
# NOTE(review): this selects the raw "Text" column, not the preprocessed
# "Text_proceed" one — confirm that is intended.
COLS_TO_FIT, TARGET_COL = "Text", "Sentiment"

In [11]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that embeds each text as the mean of its word vectors.

    Parameters
    ----------
    model : object
        Word-vector lookup supporting ``word in model`` and
        ``model.get_vector(word)``. If it exposes ``vector_size`` that value is
        used as the embedding width; otherwise 300 is assumed.
    """

    def __init__(self, model):
        self.model = model

    def _vector_size(self):
        """Return the embedding width, falling back to the historical 300."""
        return getattr(self.model, "vector_size", 300)

    def get_mean_vector(self, text):
        """Average the vectors of the in-vocabulary words of ``text``.

        Returns a zero vector when no word of ``text`` is in the vocabulary.
        """
        total = np.zeros(self._vector_size())
        known_words = 0
        # split() without arguments also collapses repeated/other whitespace.
        for word in text.split():
            if word in self.model:
                total += self.model.get_vector(word)
                known_words += 1
        # Divide by the number of words actually found; the floor of 1 only
        # guards against division by zero for texts with no known words.
        return total / max(1, known_words)

    def fit(self, X, y=None):
        """No-op: the word vectors are pre-trained. ``y`` is optional and ignored."""
        return self

    def transform(self, X):
        """Return an array with one mean vector per text in ``X``."""
        vectors = [self.get_mean_vector(text) for text in X]
        if not vectors:
            # Keep a 2-D result even when there are no rows.
            return np.empty((0, self._vector_size()))
        return np.array(vectors)

In [12]:
# Load the "word2vec-google-news-300" vectors through gensim's downloader API.
# NOTE(review): presumably a large download on first use — confirm before Run All.
model = api.load("word2vec-google-news-300")

In [13]:
# Seed placeholder; None leaves seeding to whichever component receives it.
random_state = None

In [14]:
def train_and_validate(model, train_df, test_df):
    """Fit ``model`` on the training frame and report its accuracy on the test frame.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    train_df, test_df : DataFrame
        Frames containing the ``COLS_TO_FIT`` input column and the
        ``TARGET_COL`` label column.

    Returns
    -------
    float
        Accuracy on ``test_df`` as computed by ``accuracy_score``; the same value
        is printed with three decimals.
    """
    model.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])
    predictions = model.predict(test_df[COLS_TO_FIT])
    acc = accuracy_score(test_df[TARGET_COL], predictions)
    # Reuse the score computed above instead of scoring a second time.
    print(f"acc: {acc:.3f}")
    return acc

In [19]:
# Experiment: 1- to 4-gram counts feeding an L1-penalised logistic regression.
# NOTE(review): no random_state/max_iter is set for the saga solver — results may
# vary between runs; confirm.
pipe = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 4))),
    ("model", LogisticRegression(penalty="l1", solver="saga", C=4.27)),
])

In [20]:
# Fit the 1-4-gram / L1 pipeline on train_df; prints and returns accuracy on test_df.
train_and_validate(pipe, train_df, test_df)

acc: 0.811




0.811044003451251

In [15]:
# Experiment: unigram + bigram counts feeding an L1-penalised logistic regression.
pipe = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(penalty="l1", solver="saga", C=2.27)),
])

In [16]:
# Fit the 1-2-gram / L1 pipeline on train_df; prints and returns accuracy on test_df.
train_and_validate(pipe, train_df, test_df)

acc: 0.806




0.8058671268334772

In [17]:
# Experiment: 1- to 3-gram counts feeding an L1-penalised logistic regression.
pipe = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("model", LogisticRegression(penalty="l1", solver="saga", C=3.27)),
])

In [18]:
# Fit the 1-3-gram / L1 pipeline on train_df; prints and returns accuracy on test_df.
train_and_validate(pipe, train_df, test_df)

acc: 0.808




0.808455565142364

In [21]:
# Experiment: unigram + bigram counts feeding an L2-penalised logistic regression.
pipe = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(penalty="l2", solver="saga", C=2.27)),
])

In [22]:
# Fit the 1-2-gram / L2 pipeline on train_df; prints and returns accuracy on test_df.
train_and_validate(pipe, train_df, test_df)

acc: 0.802




0.8015530629853321

In [23]:
# Experiment: unigram + bigram counts feeding an elastic-net logistic regression.
# NOTE(review): l1_ratio=1 presumably makes this equivalent to a pure L1 penalty —
# confirm against the scikit-learn documentation.
pipe = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=1, C=2)),
])

In [24]:
# Fit the elastic-net pipeline on train_df; prints and returns accuracy on test_df.
train_and_validate(pipe, train_df, test_df)

acc: 0.809




0.8093183779119931