In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV, KFold
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords, wordnet
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin
import re
import string
pd.options.mode.chained_assignment = None

In [2]:
df = pd.read_csv("Desktop\stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
train_df.head()

Unnamed: 0,Text,Sentiment
4201,ove GNW!! 9-13 Calls are making me feel better...,1
387,"CSN option trader buys 1,500 of the Jan 11-16 ...",1
4385,people slag AAP for cannibalization but samsun...,1
5773,"Sensex opens 166 points lower at 35,469, Nifty...",-1
2348,CSOD Conf Call: CEO: feeling good about our po...,1


In [5]:
stem = SnowballStemmer("english")
lemma = WordNetLemmatizer()

In [6]:
stopwords_english = stopwords.words("english")

In [7]:
stopwords_english = set(stopwords_english)

In [8]:
stopwords_english.remove("not")

In [9]:
stopwords_english.remove("no")

In [10]:
def clean_text(text):
    # приводим текст к нижнему регистру
    text = text.lower()
    # создаем регулярное выражение для удаления лишних символов
    regular = r'[\*+\#+\№\"\-+\+\=+\?+\&\^\.+\;\,+\>+\(\)\/+\:\\+]'
    # регулярное выражение для замены ссылки на "URL"
    regular_url = r'(http\S+)|(www\S+)|([\w\d]+www\S+)|([\w\d]+http\S+)'
    # удаляем лишние символы
    text = re.sub(regular, '', text)
    # заменяем ссылки на "URL"
    text = re.sub(regular_url, r'URL', text)
    # заменяем числа и цифры на ' NUM '
    text = re.sub(r'(\d+\s\d+)|(\d+)',' NUM ', text)
    # удаляем лишние пробелы
    text = re.sub(r'\s+', ' ', text)
    # возвращаем очищенные данные
    return text
 
# создаем список для хранения очищенных данных
cleaned_text = []
# для каждого сообщения text из столбца data['Message']
for text in df['Text']:
    # очищаем данные  
    text = clean_text(text)
    # добавляем очищенные данные в список cleaned_text
    cleaned_text.append(text)

In [11]:
test_df["Text"]

1891     MCP take-over chatter... (I know don't laugh...)
1550    AMZN 1,200 lot bid in the Feb weekly 255P. 27 ...
1049    KO made with sugar is sold at local COST. Cons...
2523    HNZ Another American Institution sold to forei...
156     Will watch the close carefully then decide whe...
                              ...                        
5684    Sensex jumps over 750 points to cross 29,000 m...
3090                      VBD may be about move back up  
203     CSX enko view, PF Box size = 1,  VEY bullish a...
339           AYI Q1 Operational Cash Flow turns Negative
837     user: NFX ising wedge. Normally bearish. Close...
Name: Text, Length: 1159, dtype: object

In [12]:
train_df["Text_proceed"] = train_df["Text"].apply(clean_text)
test_df["Text_proceed"] = test_df["Text"].apply(clean_text)

In [13]:
train_df["Text_proceed"]

4201    ove gnw!! NUM calls are making me feel better ...
387     csn option trader buys NUM of the jan NUM call...
4385    people slag aap for cannibalization but samsun...
5773    sensex opens NUM points lower at NUM nifty sta...
2348    csod conf call ceo feeling good about our posi...
                              ...                        
3772    aap we break and close below this support NUM ...
5191    new industry data provide the first hard look ...
5226    rt @jchengwsj in hindsight wall street probabl...
5390    global stocks fall after president trump issue...
860     hpq how many upgrades tomorrow assuming autono...
Name: Text_proceed, Length: 4632, dtype: object

In [14]:
COLS_TO_FIT = "Text_proceed"
TARGET_COL = "Sentiment"

In [15]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    
    def __init__(self, model):
        self.model = model
        
    def get_mean_vector(self, text):
        v = np.zeros(300)
        c = 0
        for word in text.split(" "):
            if word in self.model:
                v += self.model.get_vector(word)
                c += 1
        c = max(43, c)
        return v / c
        
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array([self.get_mean_vector(x) for x in X])

In [16]:
test_df.head()

Unnamed: 0,Text,Sentiment,Text_proceed
1891,MCP take-over chatter... (I know don't laugh...),-1,mcp takeover chatter i know don't laugh
1550,"AMZN 1,200 lot bid in the Feb weekly 255P. 27 ...",-1,amzn NUM lot bid in the feb weekly NUM p NUM d...
1049,KO made with sugar is sold at local COST. Cons...,1,ko made with sugar is sold at local cost consu...
2523,HNZ Another American Institution sold to forei...,-1,hnz another american institution sold to forei...
156,Will watch the close carefully then decide whe...,-1,will watch the close carefully then decide whe...


In [17]:
random_state=None

In [18]:
def train_and_validate(model, train_df, test_df):
    model.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])
    predict = model.predict(test_df[COLS_TO_FIT])
    proc = accuracy_score(test_df[TARGET_COL], predict)
    print(f"proc: {accuracy_score(test_df[TARGET_COL], predict):.3f}")
    return proc

In [19]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(C=10.83, penalty="l1", solver="saga")),
])

In [20]:
train_and_validate(pipe, train_df, test_df)

proc: 0.812




0.8119068162208801

In [21]:
model = api.load("word2vec-google-news-300")

THE BEST

In [22]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(C=10.83, penalty="l1", solver="saga")),
])

In [23]:
train_and_validate(pipe, train_df, test_df)

proc: 0.813




0.8127696289905091

In [24]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 7), min_df = 1, max_df=0.1)),
    ("model", LogisticRegression(C=20, penalty="l1", solver="saga")),
])

In [25]:
train_and_validate(pipe, train_df, test_df)



proc: 0.799


0.7989646246764452

In [26]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("model", LogisticRegression(C=4.99, penalty="l2", solver="saga")),
])

In [27]:
train_and_validate(pipe, train_df, test_df)

proc: 0.804




0.8041415012942191

In [28]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(C=0.82, penalty="elasticnet", solver="saga", l1_ratio=1)),
])

In [29]:
train_and_validate(pipe, train_df, test_df)

proc: 0.804




0.8041415012942191