In [4]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, precision_score, recall_score
#from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV, KFold
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin


In [5]:
df = pd.read_csv("Desktop\stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [6]:
df.shape

(5791, 2)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5791 entries, 0 to 5790
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       5791 non-null   object
 1   Sentiment  5791 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.6+ KB


In [8]:
#train_df["Text"].sample().iloc[0]

In [46]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [47]:
train_df.head()

Unnamed: 0,Text,Sentiment
4201,ove gnw!! 9-13 calls are making me feel better...,1
387,"csn option trader buys 1,500 of the jan 11-16 ...",1
4385,people slag aap for cannibalization but samsun...,1
5773,"sensex opens 166 points lower at 35,469, nifty...",-1
2348,csod conf call: ceo: feeling good about our po...,1


In [48]:
test_df.head()

Unnamed: 0,Text,Sentiment
1891,mcp take-over chatter... (i know don't laugh...),-1
1550,"amzn 1,200 lot bid in the feb weekly 255p. 27 ...",-1
1049,ko made with sugar is sold at local cost. cons...,1
2523,hnz another american institution sold to forei...,-1
156,will watch the close carefully then decide whe...,-1


In [49]:
df["Text"] = df["Text"].str.lower()
df.head()

Unnamed: 0,Text,Sentiment
0,kickers on my watchlist xide tit soq pnk cpw b...,1
1,user: aap movie. 55% return for the fea/geed i...,1
2,user i'd be afraid to short amzn - they are lo...,1
3,mnta over 12.00,1
4,oi over 21.37,1


In [50]:
stem = SnowballStemmer("english")
lemma = WordNetLemmatizer()

In [51]:
stem.stem("Earnings")
lemma.lemmatize("Earnings")

'Earnings'

In [52]:
stop_words = set(stopwords.words("english"))

In [53]:
stop_words.remove("not")

In [54]:
stop_words.remove("no")

In [55]:
def clean_text(text):
    # приводим текст к нижнему регистру
    text = text.lower()
    # создаем регулярное выражение для удаления лишних символов
    regular = r'[\*+\#+\№\"\-+\+\=+\?+\&\^\.+\;\,+\>+\(\)\/+\:\\+]'
    # регулярное выражение для замены ссылки на "URL"
    regular_url = r'(http\S+)|(www\S+)|([\w\d]+www\S+)|([\w\d]+http\S+)'
    # удаляем лишние символы
    text = re.sub(regular, '', text)
    # заменяем ссылки на "URL"
    text = re.sub(regular_url, r'URL', text)
    # заменяем числа и цифры на ' NUM '
    text = re.sub(r'(\d+\s\d+)|(\d+)',' NUM ', text)
    # удаляем лишние пробелы
    text = re.sub(r'\s+', ' ', text)
    # возвращаем очищенные данные
    return text
 
# создаем список для хранения очищенных данных
cleaned_text = []
# для каждого сообщения text из столбца data['Message']
for text in df['Text']:
    # очищаем данные  
    text = clean_text(text)
    # добавляем очищенные данные в список cleaned_text
    cleaned_text.append(text)
# записываем очищенные данные в новую колонку 'Cleaned_msg'
#df['Cleaned_msg'] = cleaned_text

In [56]:
#list(train_df["Text"].sample(20))

In [57]:
train_df["Text_proceed"] = train_df["Text"].apply(clean_text)
test_df["Text_proceed"] = test_df["Text"].apply(clean_text)

In [58]:
train_df["Text_proceed"]

4201    ove gnw!! NUM calls are making me feel better ...
387     csn option trader buys NUM of the jan NUM call...
4385    people slag aap for cannibalization but samsun...
5773    sensex opens NUM points lower at NUM nifty sta...
2348    csod conf call ceo feeling good about our posi...
                              ...                        
3772    aap we break and close below this support NUM ...
5191    new industry data provide the first hard look ...
5226    rt @jchengwsj in hindsight wall street probabl...
5390    global stocks fall after president trump issue...
860     hpq how many upgrades tomorrow assuming autono...
Name: Text_proceed, Length: 4632, dtype: object

In [59]:
COLS_TO_FIT = "Text_proceed"
TARGET_COL = "Sentiment"

In [60]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    
    def __init__(self, model):
        self.model = model
        
    def get_mean_vector(self, text):
        v = np.zeros(300)
        c = 0
        for word in text.split(" "):
            if word in self.model:
                v += self.model.get_vector(word)
                c += 1
        c = max(43, c)
        return v / c
        
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array([self.get_mean_vector(x) for x in X])

In [61]:
test_df.head()

Unnamed: 0,Text,Sentiment,Text_proceed
1891,mcp take-over chatter... (i know don't laugh...),-1,mcp takeover chatter i know don't laugh
1550,"amzn 1,200 lot bid in the feb weekly 255p. 27 ...",-1,amzn NUM lot bid in the feb weekly NUM p NUM d...
1049,ko made with sugar is sold at local cost. cons...,1,ko made with sugar is sold at local cost consu...
2523,hnz another american institution sold to forei...,-1,hnz another american institution sold to forei...
156,will watch the close carefully then decide whe...,-1,will watch the close carefully then decide whe...


In [62]:
random_state=None

In [63]:
def train_and_validate(model, train_df, test_df):
    model.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])
    pr = model.predict(test_df[COLS_TO_FIT])
    mse = mean_squared_error(test_df[TARGET_COL], pr)
    out = accuracy_score(test_df[TARGET_COL], pr)
    print(f"mse: {mean_squared_error(test_df[TARGET_COL], pr):.3f}")
    print(f"mae: {mean_absolute_error(test_df[TARGET_COL], pr):.3f}")
    print(f"out: {accuracy_score(test_df[TARGET_COL], pr):.3f}")
    return out

In [64]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression()),
])

In [65]:
train_and_validate(pipe, train_df, test_df)

mse: 0.808
mae: 0.404
out: 0.798


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7981018119068162

In [66]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 1), min_df = 2, max_df=0.1)),
    ("model", LogisticRegression()),
])

In [67]:
random_state=None

In [68]:
train_and_validate(pipe, train_df, test_df)

mse: 0.794
mae: 0.397
out: 0.802


0.8015530629853321

In [69]:
random_state=None

In [70]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(C=3.9, penalty="elasticnet", solver="saga", l1_ratio=1))
])

In [71]:
train_and_validate(pipe, train_df, test_df)

mse: 0.745
mae: 0.373
out: 0.814




0.813632441760138

In [72]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression()),
])

In [73]:
train_and_validate(pipe, train_df, test_df)

mse: 0.808
mae: 0.404
out: 0.798


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7981018119068162

In [74]:
model = api.load("word2vec-google-news-300")

In [75]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9), min_df = 1, max_df=0.1)),
    ("model", LogisticRegression(C=0.91, penalty="l1", solver="saga")),
])

In [76]:
train_and_validate(pipe, train_df, test_df)



mse: 0.766
mae: 0.383
out: 0.808


0.808455565142364

In [77]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(C=3.9, penalty="elasticnet", solver="saga", l1_ratio=1)),
])

In [78]:
train_and_validate(pipe, train_df, test_df)

mse: 0.752
mae: 0.376
out: 0.812




0.8119068162208801

In [79]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9))),
    ("model", LogisticRegression(C=3.91, penalty="l1", solver="saga")),
])

In [80]:
train_and_validate(pipe, train_df, test_df)



mse: 0.728
mae: 0.364
out: 0.818


0.817946505608283

In [81]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9))),
    ("model", LogisticRegression(C=3.99, penalty="l1", solver="saga")),
])

# самый лучший результат

In [82]:
train_and_validate(pipe, train_df, test_df)



mse: 0.725
mae: 0.362
out: 0.819


0.818809318377912

In [43]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9))),
    ("model", LogisticRegression(C=9.99, penalty="l1", solver="saga")),
])

In [44]:
train_and_validate(pipe, train_df, test_df)



mse: 0.773
mae: 0.387
out: 0.807


0.8067299396031061

In [45]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 1), min_df = 1, max_df=0.1)),
    ("model", LogisticRegression(C=0.99, penalty="l2", solver="saga")),
])

In [46]:
train_and_validate(pipe, train_df, test_df)

mse: 0.787
mae: 0.393
out: 0.803




0.8032786885245902

In [47]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2), min_df = 1, max_df=0.1)),
    ("model", LogisticRegression(C=3.9, penalty="elasticnet", solver="saga", l1_ratio=1)),
])

In [48]:
train_and_validate(pipe, train_df, test_df)

mse: 0.787
mae: 0.393
out: 0.803




0.8032786885245902

pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9), min_df = 1, max_df=0.1)),
    ("model", LogisticRegression(C=0.91, penalty="elasticnet", solver="saga", l1_ratio=1)),
])

train_and_validate(pipe, train_df, test_df)

In [49]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vectorizer', 'model', 'vectorizer__analyzer', 'vectorizer__binary', 'vectorizer__decode_error', 'vectorizer__dtype', 'vectorizer__encoding', 'vectorizer__input', 'vectorizer__lowercase', 'vectorizer__max_df', 'vectorizer__max_features', 'vectorizer__min_df', 'vectorizer__ngram_range', 'vectorizer__preprocessor', 'vectorizer__stop_words', 'vectorizer__strip_accents', 'vectorizer__token_pattern', 'vectorizer__tokenizer', 'vectorizer__vocabulary', 'model__C', 'model__class_weight', 'model__dual', 'model__fit_intercept', 'model__intercept_scaling', 'model__l1_ratio', 'model__max_iter', 'model__multi_class', 'model__n_jobs', 'model__penalty', 'model__random_state', 'model__solver', 'model__tol', 'model__verbose', 'model__warm_start'])

parameters = {
    'model__penalty': ["l1"],
    'vectorizer__ngram_range': [(1,9)],
   # 'vectorizer__max_df': np.arange(0.1, 0.9, 0.1),
    #'vectorizer__min_df': np.arange(1, 10, 1),
    #'model__l1_ratio' : [0.1],
    'model__solver': ["saga"],
    'model__C': np.arange(2.5)
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_


cv.best_estimator_.named_steps["model"]

parameters = {
    'model__penalty': ["l1"],
    'vectorizer__ngram_range': [(1,9)],
    'vectorizer__max_df': [(0.1)],
    'vectorizer__min_df': [(1)],
    #'model__l1_ratio' : [0.1],
    'model__solver': ["saga"],
    'model__C': np.arange(0.01, 1, 0.01)
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_
#{'model__C': 0.91,
 #'model__penalty': 'l1',
 #'model__solver': 'saga',
 #'vectorizer__max_df': 0.1,
 #'vectorizer__min_df': 1,
 #'vectorizer__ngram_range': (1, 9)}

cv.best_estimator_.named_steps["model"]
#LogisticRegression(C=0.91, penalty='l1', solver='saga')

parameters = {
    #'model__penalty': ["l1"],
    #'vectorizer__ngram_range': [(1,9)],
    #'vectorizer__max_df': [(0.1)],
    #'vectorizer__min_df': [(1)],
    #'model__l1_ratio' : [0.1],
    #'model__solver': ["saga"],
    'model__C': np.arange(1, 10, 1)
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_
#{'model__C': 1}

cv.best_estimator_.named_steps["model"]

parameters = {
    #'model__penalty': ["l1"],
    #'vectorizer__ngram_range': [(1,9)],
    #'vectorizer__max_df': [(0.1)],
    #'vectorizer__min_df': [(1)],
    #'model__l1_ratio' : [0.1],
    #'model__solver': ["saga"],
    'model__C': np.arange(2.85, 2.95, 0.01)
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_
#{'model__C': 2.85}

cv.best_estimator_.named_steps["model"]

parameters = {
    #'model__penalty': ["l1"],
    #'vectorizer__ngram_range': [(1,9)],
    #'vectorizer__max_df': [(0.1)],
    #'vectorizer__min_df': [(1)],
    #'model__l1_ratio' : [0.1],
    #'model__solver': ["saga"],
    'model__C': np.arange(10, 99, 1)
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_

#{'model__C': 10}

cv.best_estimator_.named_steps["model"]

In [50]:
parameters = {
    'model__penalty': ["l1", "l2", "elasticnet"],
    #'vectorizer__ngram_range': [(1,9)],
    #'vectorizer__max_df': [(0.1)],
    #'vectorizer__min_df': [(1)],
    #'model__l1_ratio' : [0.1],
    #'model__solver': ["saga"],
    'model__C': np.arange(0.85, 0.95, 0.01)
}

In [51]:
cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1)

In [None]:
cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

In [None]:
cv.best_score_

In [None]:
cv.best_params_


In [None]:
cv.best_estimator_.named_steps["model"]

parameters = {
    #'model__penalty': ["l1"],
    #'vectorizer__ngram_range': [(1,9)],
    #'vectorizer__max_df': [(0.1)],
    #'vectorizer__min_df': [(1)],
    #'model__l1_ratio' : [0.1],
    #'model__solver': ["saga"],
    'model__C': np.arange(10.3, 10.42, 0.01)
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_


cv.best_estimator_.named_steps["model"]

parameters = {
    'model__penalty': ["l1"],
    'vectorizer__ngram_range': [(1,9)],
    'vectorizer__max_df':np.arange(0.01, 1, 0.01),
    'vectorizer__min_df': [(1)],
    #'model__l1_ratio' : [0.1],
    'model__solver': ["saga"],
    'model__C': [(0.91)]
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_

cv.best_estimator_.named_steps["model"]

parameters = {
    'model__penalty': ["l1"],
    #'vectorizer__ngram_range': np.arange(1,9),
    #'vectorizer__max_df':np.arange(0.5),
    'vectorizer__min_df': np.arange(1, 10, 1),
    #'model__l1_ratio' : [0.1],
    'model__solver': ["saga"],
    'model__C': [0.91]
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True, random_state=None), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_
{'model__C': 0.91,
 'model__penalty': 'l1',
 'model__solver': 'saga',
 'vectorizer__min_df': 3}

cv.best_estimator_.named_steps["model"]

parameters = {
    'model__penalty': ["elasticnet"],
    #'vectorizer__ngram_range': np.arange(1,9),
    #'vectorizer__max_df':np.arange(0.5),
    #'vectorizer__min_df': np.arange(1, 100, 1),
    'model__l1_ratio' : np.arange(0.01, 1, 0.05),
    'model__solver': ["saga"],
    'model__C': [0.91]
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True, random_state=None), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_
​
{'model__C': 0.91,
 'model__l1_ratio': 0.060000000000000005,
 'model__penalty': 'elasticnet',
 'model__solver': 'saga'}

cv.best_estimator_.named_steps["model"]
LogisticRegression(C=0.91, l1_ratio=0.060000000000000005, penalty='elasticnet',
                   solver='saga')

In [None]:
pipe = Pipeline([
    
    
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression(solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

# Лучший результат

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9), min_df = 1, max_df=0.1)),
    ("model", LogisticRegression(C=0.93, penalty="l1", solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9), min_df = 2, max_df=0.8)),
    ("model", LogisticRegression(C=0.93, penalty="l1", solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9))),
    ("model", LogisticRegression(C=0.93, penalty="l1", solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
parameters = {
    'model__penalty': ["l1"],
    'vectorizer__ngram_range': [(1,9)],
    'vectorizer__max_df':np.arange(0.1, 0.5, 0.1),
    'vectorizer__min_df': np.arange(1, 6, 1),
    'model__solver': ["saga"],
    'model__C': np.arange(0.90, 0.95, 0.01)
}

In [None]:
cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True, random_state=None), n_jobs=-1)

In [None]:
cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

In [None]:
cv.best_score_

In [None]:
cv.best_params_
#{'model__C': 0.92,
# 'model__penalty': 'l1',
# 'model__solver': 'saga',
# 'vectorizer__max_df': 0.1,
# 'vectorizer__min_df': 1,
# 'vectorizer__ngram_range': (1, 9)}

In [None]:
cv.best_estimator_.named_steps["model"]

   def preprocessing(token):
    # Copied from here
    token = re.sub(r"(...)our$", r"\1or", token)
    token = re.sub(r"([bt])re$", r"\1er", token)
    token = re.sub(r"([iy])s(e$|ing|ation)", r"\1z\2", token)
    token = re.sub(r"ogue$", "og", token)
    return token
def correct_typo(tokens):
    spell = SpellChecker()
    return [spell.correction(t) if len(spell.unknown([t]))>0 else t for t in tokens]
        
def preprocess_text(text):
    # 1. Tokenise to alphabetic tokens
    tokeniser = RegexpTokenizer(r'[A-Za-z]+')
    tokens = tokeniser.tokenize(text)
    
    # 2. Lowercase and lemmatise
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(t.lower(), pos='v') for t in tokens]
#3. Correct spelling (this won't convert 100% )
    tokens = correct_typo(tokens)
    
    # 4. Convert British spelling to American spelling (this won't convert 100%)
    tokens = [convert_to_american(t) for t in tokens]
#5. Remove stopwords
    stop_words = stopwords.words('english')
    stop_words.extend(['cannot', 'could', 'done', 'let', 'may' 'mayn',  'might',  'must', 'need', 'ought', 'oughtn', 
                       'shall', 'would', 'br'])
    tokens = [t for t in tokens if t not in stop_words]
    
    return tokens