In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer, RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin
import string

In [2]:
# Load the raw dataset (path is relative to the notebook's working directory).
# A forward slash is used because it is accepted on Windows and POSIX alike; the
# previous back-slash literal depended on "\s" being an unrecognised escape
# sequence, which recent Python versions warn about.
df = pd.read_csv("Desktop/stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [4]:
# English Snowball stemmer and WordNet lemmatizer instances.
stem, lemma = SnowballStemmer("english"), WordNetLemmatizer()

In [5]:
# English stop words taken from NLTK's stopwords corpus.
stopwords_english = stopwords.words("english")

In [6]:
# Turn the stop-word collection into a set (duplicates collapse, order is dropped).
stopwords_english = set(stopwords_english)

In [7]:
# Keep the negation "not" out of the stop-word set.
# discard() is used instead of remove() so that re-running this cell does not
# raise KeyError once "not" has already been dropped.
stopwords_english.discard("not")

In [8]:
# Display the standard-library punctuation characters for reference.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
# Text-cleaning helper
def clean_text(text):
    """Normalise one raw message into a lower-cased, simplified string.

    The steps run in a fixed order:
      1. lower-case the text;
      2. delete the punctuation characters listed in the first pattern;
      3. replace URL-like tokens (containing ``http`` / ``www``) with ``URL``;
      4. replace digit runs (optionally two runs split by one whitespace
         character) with `` NUM ``;
      5. collapse every whitespace run into a single space.

    Leading/trailing spaces are not stripped.

    Parameters
    ----------
    text : str
        Raw message.

    Returns
    -------
    str
        The cleaned message.
    """
    # (pattern, replacement) pairs, applied top to bottom; the order matters
    # because punctuation is removed before URLs and numbers are detected.
    substitutions = (
        (r'[\*+\#+\№\"\-+\+\=+\?+\&\^\.+\;\,+\>+\(\)\/+\:\\+]', ''),
        (r'(http\S+)|(www\S+)|([\w\d]+www\S+)|([\w\d]+http\S+)', r'URL'),
        (r'(\d+\s\d+)|(\d+)', ' NUM '),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
  
# Clean every message in df['Text'], then split each cleaned message into tokens.
# Comprehensions are used so that no loop variable is rebound from a string to a
# token list and left behind in the notebook namespace.
cleaned_text = [clean_text(message) for message in df['Text']]
tokens = [word_tokenize(message) for message in cleaned_text]

In [10]:
# Add the cleaned text as a new column on each split.
# assign() returns a new frame instead of writing into the objects returned by
# train_test_split, which in some pandas versions triggers SettingWithCopyWarning;
# it also keeps this cell safe to re-run.
train_df = train_df.assign(Text_proceed=train_df["Text"].apply(clean_text))
test_df = test_df.assign(Text_proceed=test_df["Text"].apply(clean_text))

In [11]:
# Name of the cleaned-text column used as model input (a single column, despite the plural name).
COLS_TO_FIT = "Text_proceed"
# Name of the label column.
TARGET_COL = "Sentiment"

In [12]:
def train_and_validate(model, train_df, test_df):
    """Fit ``model`` on the training frame and report accuracy on the test frame.

    The feature column is taken from ``COLS_TO_FIT`` and the label column from
    ``TARGET_COL`` (module-level constants).

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    train_df, test_df : pandas.DataFrame
        Frames that contain both the feature and the label column.

    Returns
    -------
    float
        Accuracy of the predictions on ``test_df``; the value is also printed
        rounded to three decimals.
    """
    model.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])
    forecast = model.predict(test_df[COLS_TO_FIT])
    answer = accuracy_score(test_df[TARGET_COL], forecast)
    # Reuse the score computed above instead of evaluating the metric a second time.
    print(f"answer: {answer:.3f}")
    return answer

In [13]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
# NOTE(review): presumably meant to be passed to Word2VecModel; no visible cell uses it — confirm.
model = api.load("word2vec-google-news-300")

In [14]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    """Transformer that maps each text to one pooled word-embedding vector.

    Parameters
    ----------
    model : object
        Embedding lookup supporting ``word in model`` and
        ``model.get_vector(word)``. If it exposes ``vector_size`` that value is
        used as the output dimensionality; otherwise 300 is assumed.
    min_divisor : int, default=43
        Lower bound for the divisor applied to the summed word vectors. The
        default reproduces the previously hard-coded value, which means texts
        with fewer than 43 known words are divided by 43 rather than by their
        word count. Pass ``1`` to obtain a plain mean.
    """

    def __init__(self, model, min_divisor=43):
        self.model = model
        self.min_divisor = min_divisor

    def get_mean_vector(self, text):
        """Return the summed vectors of the known words in ``text`` divided by
        ``max(min_divisor, number_of_known_words)``.

        Words are obtained by splitting on single spaces; words missing from
        the embedding model are skipped. A text without any known word yields
        a zero vector.
        """
        dim = getattr(self.model, "vector_size", 300)
        total = np.zeros(dim)
        known = 0
        for word in text.split(" "):
            if word in self.model:
                total += self.model.get_vector(word)
                known += 1
        # The extra 1 guards against division by zero if min_divisor is set to 0.
        return total / max(self.min_divisor, known, 1)

    def fit(self, X, y=None):
        """No-op fit; ``y`` is optional so ``fit_transform(X)`` works without labels."""
        return self

    def transform(self, X):
        """Return an array with one pooled vector per text in ``X``."""
        return np.array([self.get_mean_vector(x) for x in X])

In [15]:
# NOTE(review): this variable is not referenced by any other visible cell; the
# estimators below are constructed without it — confirm whether it was meant to be passed.
random_state=None

In [38]:
# Bag-of-words (unigrams and bigrams) followed by L1-regularised logistic regression.
# The saga solver shuffles the data, so random_state is fixed to make the score repeatable.
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(solver="saga", C=5.7, penalty="l1", random_state=42)),
])

In [39]:
# Fit the pipeline on the training split and report accuracy on the test split.
train_and_validate(pipe, train_df, test_df)

answer: 0.813




0.8127696289905091

In [44]:
# Bag-of-words over whitespace-delimited tokens (unigrams only) followed by
# elastic-net logistic regression with l1_ratio=1.
# The saga solver shuffles the data, so random_state is fixed to make the score repeatable.
pipe = Pipeline([
    ("vectorizer", CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 1))),
    ("model", LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=1, C= 5, random_state=42)),
])

In [45]:
# Fit the pipeline on the training split and report accuracy on the test split.
train_and_validate(pipe, train_df, test_df)

answer: 0.802




0.8015530629853321

In [16]:
# Bag-of-words (unigrams and bigrams) followed by L2-regularised logistic regression.
# The saga solver shuffles the data, so random_state is fixed to make the score repeatable.
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(solver="saga", C=5.89, penalty="l2", random_state=42)),
])

In [17]:
# Fit the pipeline on the training split and report accuracy on the test split.
train_and_validate(pipe, train_df, test_df)

answer: 0.802




0.8015530629853321