In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the raw dataset. A forward slash is used as the path separator because it
# works on Windows and POSIX alike; the previous backslash form relied on the
# invalid string escape "\s", which newer Python versions warn about.
df = pd.read_csv("Desktop/stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [4]:
def clean_text(text):
    """Normalise a raw message string for downstream vectorisation.

    The steps run in a fixed order: lower-case the input, delete a fixed set
    of punctuation/special characters, replace tokens containing ``http`` or
    ``www`` with ``URL``, replace digit runs (or two digit runs separated by
    a single whitespace character) with `` NUM ``, and finally collapse every
    whitespace run into one space.

    Punctuation is removed *before* link detection, so a link is matched on
    what is left of it after characters such as ``:``, ``/`` and ``.`` are
    gone. Leading and trailing spaces are not trimmed.

    Parameters
    ----------
    text : str
        Raw message.

    Returns
    -------
    str
        The cleaned message.
    """
    # (pattern, replacement) pairs; the order matters and mirrors the steps above.
    substitutions = (
        # drop punctuation and special symbols
        (r'[\*+\#+\№\"\-+\+\=+\?+\&\^\.+\;\,+\>+\(\)\/+\:\\+]', ''),
        # links -> "URL"
        (r'(http\S+)|(www\S+)|([\w\d]+www\S+)|([\w\d]+http\S+)', r'URL'),
        # numbers -> " NUM "
        (r'(\d+\s\d+)|(\d+)', ' NUM '),
        # squeeze repeated whitespace
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
 
# Tokenised, stop-word-filtered and lemmatised version of every message in df['Text'].

# English stop words, kept under the original name as the list NLTK returns.
stop_words = stopwords.words('english')
# Frozen-set copy so the per-token membership test below is O(1) instead of a
# linear scan over the list for every token of every message.
stop_word_lookup = frozenset(stop_words)
# WordNet-based lemmatizer shared by all messages.
lemmatizer = WordNetLemmatizer()

# One list of processed tokens per message, in the same order as df['Text'].
processed_text = []
for text in df['Text']:
    # clean -> tokenise -> drop stop words -> lemmatise
    tokens = wordpunct_tokenize(clean_text(text))
    processed_text.append(
        [lemmatizer.lemmatize(token) for token in tokens if token not in stop_word_lookup]
    )

In [5]:
# Add the cleaned-text column to both splits with the same cleaning function,
# so train and test are preprocessed identically.
for split_frame in (train_df, test_df):
    split_frame["Text_pr"] = split_frame["Text"].apply(clean_text)

In [6]:
# Name of the cleaned-text column that is fed to the models as input.
CTF: str = "Text_pr"
# Name of the label column the models are trained to predict.
TC: str = "Sentiment"

In [7]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    """Transformer that represents each text by the mean of its word vectors.

    Parameters
    ----------
    model
        Word-embedding lookup that supports ``word in model`` and
        ``model.get_vector(word)``.
        NOTE(review): presumably a gensim ``KeyedVectors`` instance - confirm
        against the caller.
    """

    def __init__(self, model):
        self.model = model

    def get_mean_vector(self, text):
        """Return the average embedding of the space-separated words in ``text``.

        Words that are not in the model are skipped. When no word is known the
        result is an all-zero vector.
        """
        # Use the model's own dimensionality when it exposes one; 300 is the
        # value that used to be hard-coded and stays as the fallback.
        dim = getattr(self.model, "vector_size", 300)
        total = np.zeros(dim)
        known = 0
        for word in text.split(" "):
            if word in self.model:
                total += self.model.get_vector(word)
                known += 1
        # Divide by the number of known words. The floor of 1 only guards
        # against division by zero; a larger floor would shrink every vector
        # instead of averaging it.
        return total / max(1, known)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the embeddings are pretrained.

        ``y`` is optional so that ``fit_transform(X)`` works without labels.
        """
        return self

    def transform(self, X):
        """Return an array with one mean vector per text in ``X``."""
        return np.array([self.get_mean_vector(x) for x in X])

In [8]:
def train_and_validate(model, train_df, test_df):
    """Fit ``model`` on the training split and report accuracy on the test split.

    The feature column is looked up through the module-level name ``CTF`` and
    the target column through ``TC``.

    Parameters
    ----------
    model
        Object with ``fit(X, y)`` and ``predict(X)`` methods.
    train_df, test_df
        Containers indexable by ``CTF`` and ``TC``.

    Returns
    -------
    The accuracy on ``test_df``; the same value is also printed with three
    decimals.
    """
    model.fit(train_df[CTF], train_df[TC])
    predictions = model.predict(test_df[CTF])
    # Score once and reuse the value for both the printout and the return value.
    accuracy = accuracy_score(test_df[TC], predictions)
    print(f"tochnost: {accuracy:.3f}")
    return accuracy

In [69]:
# NOTE(review): this name is not referenced by any other cell visible here (the
# train/test split passes a literal seed instead) - confirm whether it is still needed.
random_state=None

In [70]:
# Load the embeddings registered as "word2vec-google-news-300" through gensim's downloader API.
# NOTE(review): presumably the 300-dimensional Google News word2vec vectors, which
# would be a large download on first use - confirm. No cell visible here wraps
# `model` in Word2VecModel yet.
model = api.load("word2vec-google-news-300")

In [71]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    """Transformer that represents each text by the mean of its word vectors.

    Parameters
    ----------
    model
        Word-embedding lookup that supports ``word in model`` and
        ``model.get_vector(word)``.
        NOTE(review): presumably a gensim ``KeyedVectors`` instance - confirm
        against the caller.
    """

    def __init__(self, model):
        self.model = model

    def get_mean_vector(self, text):
        """Return the average embedding of the space-separated words in ``text``.

        Words that are not in the model are skipped. When no word is known the
        result is an all-zero vector.
        """
        # Use the model's own dimensionality when it exposes one; 300 is the
        # value that used to be hard-coded and stays as the fallback.
        dim = getattr(self.model, "vector_size", 300)
        total = np.zeros(dim)
        known = 0
        for word in text.split(" "):
            if word in self.model:
                total += self.model.get_vector(word)
                known += 1
        # The floor of 1 only guards against division by zero when no word is known.
        return total / max(1, known)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the embeddings are pretrained.

        ``y`` is optional so that ``fit_transform(X)`` works without labels.
        """
        return self

    def transform(self, X):
        """Return an array with one mean vector per text in ``X``."""
        return np.array([self.get_mean_vector(x) for x in X])

In [102]:
# Bag of 1- to 4-grams over whitespace-separated tokens (n-grams occurring in more
# than 10% of the documents are dropped), followed by L1-regularised logistic regression.
pipe = Pipeline([
    ("vectorizer", CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 4), min_df=1, max_df=0.1)),
    # The saga solver shuffles the data, so the seed is pinned to make the fit reproducible.
    ("model", LogisticRegression(solver="saga", C=1.35, penalty="l1", random_state=42)),
])

In [103]:
# Fit the pipeline defined above on the training split and show its test accuracy.
train_and_validate(model=pipe, train_df=train_df, test_df=test_df)

tochnost: 0.813




0.8127696289905091

In [74]:
# Bag of 1- to 3-grams over whitespace-separated tokens (n-grams occurring in more
# than 10% of the documents are dropped), followed by weakly L2-regularised
# logistic regression (large C).
pipe = Pipeline([
    ("vectorizer", CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 3), min_df=1, max_df=0.1)),
    # The saga solver shuffles the data, so the seed is pinned to make the fit reproducible.
    ("model", LogisticRegression(solver="saga", C=49.99, penalty="l2", random_state=42)),
])

In [75]:
# Fit the pipeline defined above on the training split and show its test accuracy.
train_and_validate(model=pipe, train_df=train_df, test_df=test_df)

tochnost: 0.806




0.8058671268334772

In [100]:
# Bag of 1- to 3-grams over whitespace-separated tokens (n-grams occurring in more
# than 10% of the documents are dropped), followed by elastic-net logistic
# regression with an even L1/L2 mix.
pipe = Pipeline([
    ("vectorizer", CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 3), min_df=1, max_df=0.1)),
    # The saga solver shuffles the data, so the seed is pinned to make the fit reproducible.
    ("model", LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.5, C= 9.99, random_state=42)),
])

In [101]:
# Fit the pipeline defined above on the training split and show its test accuracy.
train_and_validate(model=pipe, train_df=train_df, test_df=test_df)

tochnost: 0.810




0.8101811906816221