In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, precision_score, recall_score
#from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV, KFold
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin


In [2]:
df = pd.read_csv("Desktop\stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
df.shape

(5791, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5791 entries, 0 to 5790
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       5791 non-null   object
 1   Sentiment  5791 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.6+ KB


In [5]:
#train_df["Text"].sample().iloc[0]

In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
train_df.head()

Unnamed: 0,Text,Sentiment
4201,ove GNW!! 9-13 Calls are making me feel better...,1
387,"CSN option trader buys 1,500 of the Jan 11-16 ...",1
4385,people slag AAP for cannibalization but samsun...,1
5773,"Sensex opens 166 points lower at 35,469, Nifty...",-1
2348,CSOD Conf Call: CEO: feeling good about our po...,1


In [8]:
test_df.head()

Unnamed: 0,Text,Sentiment
1891,MCP take-over chatter... (I know don't laugh...),-1
1550,"AMZN 1,200 lot bid in the Feb weekly 255P. 27 ...",-1
1049,KO made with sugar is sold at local COST. Cons...,1
2523,HNZ Another American Institution sold to forei...,-1
156,Will watch the close carefully then decide whe...,-1


In [9]:
stem = SnowballStemmer("english")
lemma = WordNetLemmatizer()

In [10]:
#stem.stem("Earnings")
#lemma.lemmatize("Earnings")

In [11]:
stop_words = set(stopwords.words("english"))

In [12]:
stop_words.remove("not")

In [13]:
stop_words.remove("no")

In [16]:
def clean_text(text):
    # приводим текст к нижнему регистру
    text = text.lower()
    # создаем регулярное выражение для удаления лишних символов
    regular = r'[\*+\#+\№\"\-+\+\=+\?+\&\^\.+\;\,+\>+\(\)\@\/+\:\\+]'
    # регулярное выражение для замены ссылки на "URL"
    regular_url = r'(http\S+)|(www\S+)|([\w\d]+www\S+)|([\w\d]+http\S+)'
    # удаляем лишние символы
    text = re.sub(regular, '', text)
    # заменяем ссылки на "URL"
    text = re.sub(regular_url, r'URL', text)
    # заменяем числа и цифры на ' NUM '
    text = re.sub(r'(\d+\s\d+)|(\d+)',' NUM ', text)
    # удаляем лишние пробелы
    text = re.sub(r'\s+', ' ', text)
    # возвращаем очищенные данные
    return text
    
#stop_words.extend(['cannot', 'could', 'done', 'let', 'may' 'mayn',  'might',  'must', 'need', 'ought', 'oughtn', 
                       #'shall', 'would', 'br'])
# создаем список для хранения очищенных данных
cleaned_text = []
# для каждого сообщения text из столбца data['Message']
for text in df['Text']:
    # очищаем данные  
    text = clean_text(text)
    # добавляем очищенные данные в список cleaned_text
    cleaned_text.append(text)
# записываем очищенные данные в новую колонку 'Cleaned_msg'
#df['Cleaned_msg'] = cleaned_text

In [17]:
#list(train_df["Text"].sample(20))

In [18]:
train_df["Text_proceed"] = train_df["Text"].apply(clean_text)
test_df["Text_proceed"] = test_df["Text"].apply(clean_text)

In [19]:
train_df["Text_proceed"]

4201    ove gnw!! NUM calls are making me feel better ...
387     csn option trader buys NUM of the jan NUM call...
4385    people slag aap for cannibalization but samsun...
5773    sensex opens NUM points lower at NUM nifty sta...
2348    csod conf call ceo feeling good about our posi...
                              ...                        
3772    aap we break and close below this support NUM ...
5191    new industry data provide the first hard look ...
5226    rt jchengwsj in hindsight wall street probably...
5390    global stocks fall after president trump issue...
860     hpq how many upgrades tomorrow assuming autono...
Name: Text_proceed, Length: 4632, dtype: object

In [20]:
COLS_TO_FIT = "Text_proceed"
TARGET_COL = "Sentiment"

In [21]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    
    def __init__(self, model):
        self.model = model
        
    def get_mean_vector(self, text):
        v = np.zeros(300)
        c = 0
        for word in text.split(" "):
            if word in self.model:
                v += self.model.get_vector(word)
                c += 1
        c = max(43, c)
        return v / c
        
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array([self.get_mean_vector(x) for x in X])

In [22]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 1), min_df = 18, max_df=0.1)),
    ("model", LogisticRegression()),
])

In [23]:
def train_and_validate(model, train_df, test_df):
    model.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])
    pr = model.predict(test_df[COLS_TO_FIT])
    mse = mean_squared_error(test_df[TARGET_COL], pr)
    out = accuracy_score(test_df[TARGET_COL], pr)
    print(f"mse: {mean_squared_error(test_df[TARGET_COL], pr):.3f}")
    print(f"mae: {mean_absolute_error(test_df[TARGET_COL], pr):.3f}")
    print(f"out: {accuracy_score(test_df[TARGET_COL], pr):.3f}")
    return out

In [24]:
train_and_validate(pipe, train_df, test_df)

mse: 0.966
mae: 0.483
out: 0.758


0.7584124245038827

In [25]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression()),
])

In [26]:
train_and_validate(pipe, train_df, test_df)

mse: 0.808
mae: 0.404
out: 0.798


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7981018119068162

In [27]:
model = api.load("word2vec-google-news-300")

# Лучший результат

In [30]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9), min_df = 1, max_df=0.1)),
    ("model", LogisticRegression(C=1, penalty="l1", solver="saga")),
])

In [31]:
train_and_validate(pipe, train_df, test_df)



mse: 0.777
mae: 0.388
out: 0.806


0.8058671268334772

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 1), min_df = 5, max_df=0.9)),
    ("model", LogisticRegression(C=2, penalty="l2", solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9), min_df = 5, max_df=0.1)),
    ("model", LogisticRegression(C=2, penalty="elasticnet", solver="saga", l1_ratio=1)),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression()),
])

In [None]:
pipe.get_params().keys()

parameters = {
    'model__penalty': ["l1"],
    'vectorizer__ngram_range': [(1,9)],
   # 'vectorizer__max_df': np.arange(0.1, 0.9, 0.1),
    #'vectorizer__min_df': np.arange(1, 10, 1),
    #'model__l1_ratio' : [0.1],
    'model__solver': ["saga"],
    'model__C': np.arange(2.5, 3, 0.1)
}

cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

cv.best_score_

cv.best_params_


In [None]:
cv.best_estimator_.named_steps["model"]

In [None]:
parameters = {
    'model__penalty': ["l1"],
    'vectorizer__ngram_range': [(1,9)],
    'vectorizer__max_df': [(0.1)],
    'vectorizer__min_df': np.arange(1, 2, 1),
    #'model__l1_ratio' : [0.1],
    'model__solver': ["saga"],
    'model__C': np.arange(1, 2.5, 1.5)
}

In [None]:
cv = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(5, shuffle=True), n_jobs=-1)

In [None]:
cv.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])

In [None]:
cv.best_score_

In [None]:
cv.best_params_


In [None]:
cv.best_estimator_.named_steps["model"]

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression(solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

   def preprocessing(token):
    # Copied from here
    token = re.sub(r"(...)our$", r"\1or", token)
    token = re.sub(r"([bt])re$", r"\1er", token)
    token = re.sub(r"([iy])s(e$|ing|ation)", r"\1z\2", token)
    token = re.sub(r"ogue$", "og", token)
    return token
def correct_typo(tokens):
    spell = SpellChecker()
    return [spell.correction(t) if len(spell.unknown([t]))>0 else t for t in tokens]
        
def preprocess_text(text):
    # 1. Tokenise to alphabetic tokens
    tokeniser = RegexpTokenizer(r'[A-Za-z]+')
    tokens = tokeniser.tokenize(text)
    
    # 2. Lowercase and lemmatise
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(t.lower(), pos='v') for t in tokens]
# 3. Correct spelling (this won't convert 100% )
    tokens = correct_typo(tokens)
    
    # 4. Convert British spelling to American spelling (this won't convert 100%)
    tokens = [convert_to_american(t) for t in tokens]
# 5. Remove stopwords
    stop_words = stopwords.words('english')
    stop_words.extend(['cannot', 'could', 'done', 'let', 'may' 'mayn',  'might',  'must', 'need', 'ought', 'oughtn', 
                       'shall', 'would', 'br'])
    tokens = [t for t in tokens if t not in stop_words]
    
    return tokens