In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin


In [2]:
# Load the raw dataset (path is relative to the working directory). A forward
# slash replaces the backslash, which formed the invalid escape "\s" and only
# resolved as a path separator on Windows.
df = pd.read_csv("Desktop/stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
# (rows, columns) of the raw frame.
df.shape

(5791, 2)

In [4]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5791 entries, 0 to 5790
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       5791 non-null   object
 1   Sentiment  5791 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.6+ KB


In [5]:
# Positional 80/20 split: the leading rows become the training set and the
# remaining rows the test set. Rows are not shuffled.
train_size = int(len(df) * 0.8)
train_df, test_df = df.iloc[:train_size].copy(), df.iloc[train_size:].copy()

In [6]:
# Sanity check on the sizes of the two splits.
train_df.shape, test_df.shape

((4632, 2), (1159, 2))

In [7]:
#df['track name'] = df['track name'].str.replace(r'\W

In [8]:
# Summary statistics of the frame's numeric column(s).
df.describe()

Unnamed: 0,Sentiment
count,5791.0
mean,0.272664
std,0.962192
min,-1.0
25%,-1.0
50%,1.0
75%,1.0
max,1.0


In [9]:
#df['Sentiment'].value_counts()

In [10]:
#sns.countplot(x='Sentiment',data=df)

In [11]:
# Show one randomly drawn raw training text. No random_state is passed, so the
# selected row can differ between runs.
train_df["Text"].sample().iloc[0]

'VXY Da bears wacked again? gap fill from yesterday. 10 on AAP 15% rev beat?'

In [12]:
# Shared NLTK normalizers: an English Snowball stemmer and a WordNet lemmatizer.
stem = SnowballStemmer("english")
lemma = WordNetLemmatizer()

In [13]:
# English stop words from the NLTK stopwords corpus.
stopwords_english = stopwords.words("english")

In [14]:
# Convert to a set so the membership tests done during preprocessing are hash lookups.
stopwords_english = set(stopwords_english)

In [15]:
# Keep "not" as a regular token so negations survive stop-word filtering.
# discard() is used instead of remove() so that re-running this cell does not
# raise KeyError once "not" has already been dropped from the set.
stopwords_english.discard("not")
#stopwords_english

In [16]:
# A token is kept when it starts with a word character, "!" or "?"
# (the pattern is applied with match(), i.e. anchored at the token start only).
IS_ALPHA = re.compile(r"[\w!?]+")


def preprocess(text):
    """Normalize a raw text into a space-joined string of lemmatized tokens.

    Steps: lowercase, expand "&" to "and" and the "n't" contraction to "not",
    tokenize with ``wordpunct_tokenize``, keep the tokens accepted by
    ``IS_ALPHA``, lemmatize them, and drop tokens found in
    ``stopwords_english``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text, tokens separated by single spaces.
    """
    # Lowercase first so that upper-case contractions such as "DON'T" are
    # expanded as well; otherwise their negation is lost after tokenization.
    text = text.lower()
    text = text.replace("&", " and ")
    text = text.replace("n't", " not ")
    lemmas = (
        lemma.lemmatize(token)
        for token in wordpunct_tokenize(text)
        if IS_ALPHA.match(token)
    )
    kept = [token for token in lemmas if token not in stopwords_english]
    return " ".join(kept).strip()

In [32]:
# Add the cleaned text as a new column on both splits.
for split_df in (train_df, test_df):
    split_df["Text_proceed"] = split_df["Text"].apply(preprocess)

In [33]:
# Inspect the cleaned training texts.
train_df["Text_proceed"]

0       kicker watchlist xide tit soq pnk cpw bpz aj t...
1       user aap movie 55 return fea geed indicator 15...
2       user afraid short amzn looking like near monop...
3                                              mnta 12 00
4                                                oi 21 37
                              ...                        
4627                                    idti like 7 41 42
4628                acad trying go higher need followthru
4629                                   get back !!! silly
4630    today watchlist ong stock jaso ziop vhc snfca ...
4631                           aap kumo earjet kumo twist
Name: Text_proceed, Length: 4632, dtype: object

In [18]:
# Column names used as the model input and as the prediction target.
COLS_TO_FIT = "Text_proceed"
TARGET_COL = "Sentiment"

In [19]:
# Baseline: token counts feeding a logistic regression. The token pattern
# treats every whitespace-delimited chunk as one token.
pipe = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer(token_pattern=r"\S+")),
        ("model", LogisticRegression()),
    ]
)

In [20]:
def train_and_validate(model, train_df, test_df):
    """Fit ``model`` on the training split and report its errors on the test split.

    Prints the mean squared error and mean absolute error of the predictions.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        train_df: Frame holding the ``COLS_TO_FIT`` and ``TARGET_COL`` columns
            used for fitting.
        test_df: Frame with the same columns, used for evaluation.

    Returns:
        The mean squared error on the test split.
    """
    model.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])
    prediction = model.predict(test_df[COLS_TO_FIT])
    target = test_df[TARGET_COL]
    # Each metric is computed exactly once; the printed MSE is the returned value.
    mse = mean_squared_error(target, prediction)
    mae = mean_absolute_error(target, prediction)
    print(f"mse: {mse:.3f}")
    print(f"mae: {mae:.3f}")
    return mse

In [21]:
# Fit and score the pipeline currently bound to `pipe`.
train_and_validate(model=pipe, train_df=train_df, test_df=test_df)

mse: 1.726
mae: 0.863


1.7256255392579811

In [None]:
# Same count-based model, with unigrams and bigrams as features.
pipe = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 2))),
        ("model", LogisticRegression()),
    ]
)

In [23]:
# Fit and score the pipeline currently bound to `pipe`.
train_and_validate(model=pipe, train_df=train_df, test_df=test_df)

mse: 1.808
mae: 0.904


1.8084555651423642

In [24]:
# Unigram counts with a pruned vocabulary: terms must occur in at least 5
# documents and in at most 30% of them.
pipe = Pipeline(
    steps=[
        (
            "vectorizer",
            CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 1), min_df=5, max_df=0.3),
        ),
        ("model", LogisticRegression()),
    ]
)

In [25]:
# Fit and score the pipeline currently bound to `pipe`.
train_and_validate(model=pipe, train_df=train_df, test_df=test_df)

mse: 1.670
mae: 0.835


1.6704055220017255

In [26]:
# TF-IDF weighting instead of raw counts, with the same vocabulary pruning
# (min_df=5, max_df=0.3), followed by a logistic regression.
pipe = Pipeline(
    steps=[
        (
            "vectorizer",
            TfidfVectorizer(token_pattern=r"\S+", ngram_range=(1, 1), min_df=5, max_df=0.3),
        ),
        ("model", LogisticRegression()),
    ]
)
train_and_validate(model=pipe, train_df=train_df, test_df=test_df)

mse: 1.833
mae: 0.916


1.8326143226919758

In [27]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
model = api.load("word2vec-google-news-300")

In [28]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    """Transformer that maps each text to the mean of its word vectors.

    Args:
        model: Word-vector lookup supporting ``word in model`` and
            ``model.get_vector(word)``.
    """

    def __init__(self, model):
        self.model = model

    def get_mean_vector(self, text):
        """Return the average vector of the words of ``text`` known to the model.

        Words are obtained by splitting ``text`` on single spaces. A text
        without any known word yields an all-zero vector.
        """
        # Take the dimensionality from the model when it exposes one; fall back
        # to the previously hard-coded 300 otherwise.
        size = getattr(self.model, "vector_size", 300)
        total = np.zeros(size)
        count = 0
        for word in text.split(" "):
            if word in self.model:
                total += self.model.get_vector(word)
                count += 1
        # max(1, count) guards against division by zero when nothing matched.
        return total / max(1, count)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``y`` is optional and ignored."""
        return self

    def transform(self, X):
        """Return an array holding one mean vector per element of ``X``."""
        return np.array([self.get_mean_vector(x) for x in X])

In [29]:
# Averaged word2vec embeddings as features for a logistic regression.
pipe = Pipeline(
    steps=[
        ("vectorizer", Word2VecModel(model)),
        ("model", LogisticRegression()),
    ]
)

In [30]:
# Fit and score the pipeline currently bound to `pipe`.
train_and_validate(model=pipe, train_df=train_df, test_df=test_df)

mse: 1.805
mae: 0.903


1.805004314063848

In [31]:
#sns.scatterplot(x=train_df["Text"], y=train_df["Sentiment"]);

In [None]:
# Matches a square-bracketed span, brackets included.
BRACETS_RE = re.compile(r"\[[^\]]*\]")
# Matches runs of the listed punctuation and whitespace characters.
PUNK_RE = re.compile(r"[.,\"'#@%:;$* \t\n^/\\-]+")


def preprocessing(text):
    """Clean a raw text and return its lemmatized, stop-word-free tokens.

    Steps: remove square-bracketed spans, lowercase, expand "&" and the
    "n't" / "n'" contractions, replace each run of ``PUNK_RE`` characters with
    a single space, tokenize with ``wordpunct_tokenize``, drop tokens found in
    ``stopwords_english`` and lemmatize the rest.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text, tokens separated by single spaces.
    """
    text = BRACETS_RE.sub("", text)
    text = text.lower()
    text = text.replace("&", " and ")
    # Contractions are expanded before PUNK_RE strips the apostrophes.
    text = text.replace("n't", " not ")
    text = text.replace("n'", " not ")
    text = PUNK_RE.sub(" ", text)

    # The stop-word set in this notebook is `stopwords_english`; the previous
    # name `stop_words` is not defined in it and raised NameError on first call.
    text_list = [
        lemma.lemmatize(word)
        for word in wordpunct_tokenize(text)
        if word not in stopwords_english
    ]
    ##text_list.extend(["_".join(text_list[i:i+2]) for i in range(len(text_list)-1)])

    return " ".join(text_list).strip()