In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, precision_score, recall_score
#from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV, KFold
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords, wordnet
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None

In [2]:
df = pd.read_csv("Desktop\stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
train_df.head()

Unnamed: 0,Text,Sentiment
4201,ove GNW!! 9-13 Calls are making me feel better...,1
387,"CSN option trader buys 1,500 of the Jan 11-16 ...",1
4385,people slag AAP for cannibalization but samsun...,1
5773,"Sensex opens 166 points lower at 35,469, Nifty...",-1
2348,CSOD Conf Call: CEO: feeling good about our po...,1


In [5]:
test_df.head()

Unnamed: 0,Text,Sentiment
1891,MCP take-over chatter... (I know don't laugh...),-1
1550,"AMZN 1,200 lot bid in the Feb weekly 255P. 27 ...",-1
1049,KO made with sugar is sold at local COST. Cons...,1
2523,HNZ Another American Institution sold to forei...,-1
156,Will watch the close carefully then decide whe...,-1


In [6]:
df["text_lower"] = df["Text"].str.lower()
df.head()

Unnamed: 0,Text,Sentiment,text_lower
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...
3,MNTA Over 12.00,1,mnta over 12.00
4,OI Over 21.37,1,oi over 21.37


In [7]:
# drop the new column created in last cell
#df.drop(["text_lower"], inplace=True)

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["text_wo_punct"] = df["Text"].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,Text,Sentiment,text_lower,text_wo_punct
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...,user AAP MOVIE 55 return for the FEAGEED indic...
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...,user Id be afraid to short AMZN they are look...
3,MNTA Over 12.00,1,mnta over 12.00,MNTA Over 1200
4,OI Over 21.37,1,oi over 21.37,OI Over 2137


In [8]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [9]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["text_wo_stop"] = df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
df.head()

Unnamed: 0,Text,Sentiment,text_lower,text_wo_punct,text_wo_stop
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...,user AAP MOVIE 55 return for the FEAGEED indic...,user AAP MOVIE 55 return FEAGEED indicator 15 ...
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...,user Id be afraid to short AMZN they are look...,user Id afraid short AMZN looking like nearmon...
3,MNTA Over 12.00,1,mnta over 12.00,MNTA Over 1200,MNTA Over 1200
4,OI Over 21.37,1,oi over 21.37,OI Over 2137,OI Over 2137


In [10]:
from collections import Counter
cnt = Counter()
for text in df["text_wo_stop"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

[('AAP', 920),
 ('user', 641),
 ('I', 459),
 ('short', 341),
 ('today', 291),
 ('like', 278),
 ('volume', 269),
 ('long', 253),
 ('day', 240),
 ('BAC', 202)]

In [11]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["text_wo_stopfreq"] = df["text_wo_stop"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,Text,Sentiment,text_lower,text_wo_punct,text_wo_stop,text_wo_stopfreq
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...,user AAP MOVIE 55 return for the FEAGEED indic...,user AAP MOVIE 55 return FEAGEED indicator 15 ...,MOVIE 55 return FEAGEED indicator 15 trades ye...
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...,user Id be afraid to short AMZN they are look...,user Id afraid short AMZN looking like nearmon...,Id afraid AMZN looking nearmonopoly eBooks inf...
3,MNTA Over 12.00,1,mnta over 12.00,MNTA Over 1200,MNTA Over 1200,MNTA Over 1200
4,OI Over 21.37,1,oi over 21.37,OI Over 2137,OI Over 2137,OI Over 2137


In [12]:
# Drop the two columns which are no more needed 
df.drop(["text_wo_punct", "text_wo_stop"], axis=1, inplace=True)

n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["text_wo_stopfreqrare"] = df["text_wo_stopfreq"].apply(lambda text: remove_rarewords(text))
df.head()

Unnamed: 0,Text,Sentiment,text_lower,text_wo_stopfreq,text_wo_stopfreqrare
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...,MOVIE 55 return FEAGEED indicator 15 trades ye...,MOVIE 55 return FEAGEED indicator 15 trades ye...
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...,Id afraid AMZN looking nearmonopoly eBooks inf...,Id afraid AMZN looking nearmonopoly eBooks inf...
3,MNTA Over 12.00,1,mnta over 12.00,MNTA Over 1200,MNTA Over 1200
4,OI Over 21.37,1,oi over 21.37,OI Over 2137,OI Over 2137


In [13]:
from nltk.stem.porter import PorterStemmer

# Drop the two columns 
#df.drop(["text_wo_stopfreq", "text_wo_stopfreqrare"], axis=1, inplace=True) 

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["text_stemmed"] = df["text_wo_stopfreqrare"].apply(lambda text: stem_words(text))
df.head()

Unnamed: 0,Text,Sentiment,text_lower,text_wo_stopfreq,text_wo_stopfreqrare,text_stemmed
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,kicker watchlist xide tit soq pnk cpw bpz aj t...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...,MOVIE 55 return FEAGEED indicator 15 trades ye...,MOVIE 55 return FEAGEED indicator 15 trades ye...,movi 55 return feage indic 15 trade year awesom
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...,Id afraid AMZN looking nearmonopoly eBooks inf...,Id afraid AMZN looking nearmonopoly eBooks inf...,id afraid amzn look nearmonopoli ebook infrast...
3,MNTA Over 12.00,1,mnta over 12.00,MNTA Over 1200,MNTA Over 1200,mnta over 1200
4,OI Over 21.37,1,oi over 21.37,OI Over 2137,OI Over 2137,oi over 2137


In [14]:
from nltk.stem.snowball import SnowballStemmer
#SnowballStemmer.languages

In [15]:
#import nltk
#nltk.download()

In [16]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["text_lemmatized"] = df["text_stemmed"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,Text,Sentiment,text_lower,text_wo_stopfreq,text_wo_stopfreqrare,text_stemmed,text_lemmatized
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,kicker watchlist xide tit soq pnk cpw bpz aj t...,kicker watchlist xide tit soq pnk cpw bpz aj t...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...,MOVIE 55 return FEAGEED indicator 15 trades ye...,MOVIE 55 return FEAGEED indicator 15 trades ye...,movi 55 return feage indic 15 trade year awesom,movi 55 return feage indic 15 trade year awesom
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...,Id afraid AMZN looking nearmonopoly eBooks inf...,Id afraid AMZN looking nearmonopoly eBooks inf...,id afraid amzn look nearmonopoli ebook infrast...,id afraid amzn look nearmonopoli ebook infrast...
3,MNTA Over 12.00,1,mnta over 12.00,MNTA Over 1200,MNTA Over 1200,mnta over 1200,mnta over 1200
4,OI Over 21.37,1,oi over 21.37,OI Over 2137,OI Over 2137,oi over 2137,oi over 2137


In [17]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
df["Text_url"] = df["text_lemmatized"].apply(remove_urls)
df.head()

Unnamed: 0,Text,Sentiment,text_lower,text_wo_stopfreq,text_wo_stopfreqrare,text_stemmed,text_lemmatized,Text_url
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,kicker watchlist xide tit soq pnk cpw bpz aj t...,kicker watchlist xide tit soq pnk cpw bpz aj t...,kicker watchlist xide tit soq pnk cpw bpz aj t...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...,MOVIE 55 return FEAGEED indicator 15 trades ye...,MOVIE 55 return FEAGEED indicator 15 trades ye...,movi 55 return feage indic 15 trade year awesom,movi 55 return feage indic 15 trade year awesom,movi 55 return feage indic 15 trade year awesom
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...,Id afraid AMZN looking nearmonopoly eBooks inf...,Id afraid AMZN looking nearmonopoly eBooks inf...,id afraid amzn look nearmonopoli ebook infrast...,id afraid amzn look nearmonopoli ebook infrast...,id afraid amzn look nearmonopoli ebook infrast...
3,MNTA Over 12.00,1,mnta over 12.00,MNTA Over 1200,MNTA Over 1200,mnta over 1200,mnta over 1200,mnta over 1200
4,OI Over 21.37,1,oi over 21.37,OI Over 2137,OI Over 2137,oi over 2137,oi over 2137,oi over 2137


In [18]:
!pip install pyspellchecker



In [19]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
df["Text_spell"] = df["Text_url"].apply(correct_spellings)
df.head()

Unnamed: 0,Text,Sentiment,text_lower,text_wo_stopfreq,text_wo_stopfreqrare,text_stemmed,text_lemmatized,Text_url,Text_spell
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...,kicker watchlist xide tit soq pnk cpw bpz aj t...,kicker watchlist xide tit soq pnk cpw bpz aj t...,kicker watchlist xide tit soq pnk cpw bpz aj t...,kicker watchlist side tit so pink cow biz aj t...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user: aap movie. 55% return for the fea/geed i...,MOVIE 55 return FEAGEED indicator 15 trades ye...,MOVIE 55 return FEAGEED indicator 15 trades ye...,movi 55 return feage indic 15 trade year awesom,movi 55 return feage indic 15 trade year awesom,movi 55 return feage indic 15 trade year awesom,movi 55 return leave indict 15 trade year awesome
2,user I'd be afraid to short AMZN - they are lo...,1,user i'd be afraid to short amzn - they are lo...,Id afraid AMZN looking nearmonopoly eBooks inf...,Id afraid AMZN looking nearmonopoly eBooks inf...,id afraid amzn look nearmonopoli ebook infrast...,id afraid amzn look nearmonopoli ebook infrast...,id afraid amzn look nearmonopoli ebook infrast...,id afraid amen look nearmonopoli book infrastr...
3,MNTA Over 12.00,1,mnta over 12.00,MNTA Over 1200,MNTA Over 1200,mnta over 1200,mnta over 1200,mnta over 1200,meta over 1200
4,OI Over 21.37,1,oi over 21.37,OI Over 2137,OI Over 2137,oi over 2137,oi over 2137,oi over 2137,oi over 2137


In [20]:
train_df["Text_proceed"] = train_df["Text_spell"].apply(correct_spellings)
test_df["Text_proceed"] = test_df["Text_spell"].apply(correct_spellings)

KeyError: 'Text_spell'

In [None]:
train_df["Text_proceed"]

In [None]:
COLS_TO_FIT = "Text_proceed"
TARGET_COL = "Sentiment"

In [None]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    
    def __init__(self, model):
        self.model = model
        
    def get_mean_vector(self, text):
        v = np.zeros(300)
        c = 0
        for word in text.split(" "):
            if word in self.model:
                v += self.model.get_vector(word)
                c += 1
        c = max(43, c)
        return v / c
        
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array([self.get_mean_vector(x) for x in X])

In [None]:
test_df.head()

In [None]:
def train_and_validate(model, train_df, test_df):
    model.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])
    pr = model.predict(test_df[COLS_TO_FIT])
    out = accuracy_score(test_df[TARGET_COL], pr)
    print(f"out: {accuracy_score(test_df[TARGET_COL], pr):.3f}")
    return out

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("model", LogisticRegression()),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
model = api.load("word2vec-google-news-300")

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9))),
    ("model", LogisticRegression(C=3.91, penalty="l1", solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("model", LogisticRegression(C=4.99, penalty="l2", solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 9))),
    ("model", LogisticRegression(C=3.9, penalty="elasticnet", solver="saga", l1_ratio=1)),
])

In [None]:
train_and_validate(pipe, train_df, test_df)

In [None]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(C=20, penalty="l1", solver="saga")),
])

In [None]:
train_and_validate(pipe, train_df, test_df)