# Testing the packages

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
sys.path.insert(1, '../functions')
import data_utilities as data_u
import dict_utilities as dict_u
import nlp_utilities as nlp_u
import time

#from jupyterthemes import jtplot
#jtplot.style()
%load_ext autoreload
%autoreload 2
%matplotlib inline

data_folder = '../data/'
dict_dir = data_folder + 'data_dict.pkl'


## Testing dict_utilities

In [50]:
import dict_utilities as dict_u

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers already saved : {data_dict.keys()}")

dict_u.reset_dict(dict_dir)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers after reset : {data_dict.keys()}")

Tickers already saved : dict_keys(['google', 'exxon'])
Tickers after reset : dict_keys([])


## Testing data_utilities

These functions now label the data at the same time, so it can take quite a long time ! Approximately 1 min every 100 labels ... This is not optimized !

Add news to data dictionary from database

In [3]:
import data_utilities as data_u

search_words = ['Google', 'Exxon']
news_to_read = "Reuters"
data_u.add_news_to_dict(search_words, data_folder, news_to_read, dict_dir)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers added : {data_dict.keys()}\n")
print(f"Number of news in ticker {search_words[0]} : {len(data_dict[search_words[0].lower()])}")

Tickers added : dict_keys(['google', 'exxon'])

Number of news in ticker Google : 3608


Add news to data dictionary from Twitter account

In [51]:
date_since = "2020-11-13"
nb_items = 1000
language = "en"
codes = data_u.get_codes(data_folder + "twitter_codes.txt")
from_ids = ['Google', 'Total']
print(f"Tickers before operation : {data_dict.keys()}\n")
data_u.add_tweets_to_dict(date_since, nb_items, language, codes,\
                    dict_dir, retweet=False, from_ids=from_ids)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers now : {data_dict.keys()}\n")
print(f"Number of news in ticker {search_words[0]} : {len(data_dict[search_words[0].lower()])}")

Tickers before operation : dict_keys([])

Tickers now : dict_keys(['google', 'total'])

Number of news in ticker Google : 1000


Add news to data dictionary from all over Twitter

In [47]:
date_since = "2020-11-13"
nb_items = 100
language = "fr"
codes = data_u.get_codes(data_folder + "twitter_codes.txt")
from_words = ['Google', 'Facebook']
print(f"Tickers before operation : {data_dict.keys()}\n")
data_u.add_tweets_to_dict(date_since, nb_items, language, codes,\
                    dict_dir, retweet=False, from_words=from_words)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers now : {data_dict.keys()}\n")
print(f"Number of news in ticker {search_words[0]} : {len(data_dict[search_words[0].lower()])}")

Tickers before operation : dict_keys(['google', 'exxon', 'total'])

Tickers now : dict_keys(['google', 'exxon', 'total', 'facebook'])

Number of news in ticker Google : 3808


In [15]:
dict_u.reset_dict(dict_dir)
search_word = "Google"
news_to_read = "Reuters"
data_u.add_news_to_dict(search_word, data_folder, news_to_read, dict_dir)


KeyboardInterrupt: 

In [2]:
backup_dir = data_folder + 'backup_dict.pkl'
data_dict = dict_u.get_dict(backup_dir)
df = data_dict["google"]

def set_Date(example):
    return example["Date_x"] if not (pd.isnull(example["Date_x"])) else example["Date_y"]
def set_Label(example):
    return example["Label_x"] if not (pd.isnull(example["Label_x"])) else example["Label_y"]
def set_Author(example):
    return example["Author_x"] if not (isinstance(example["Author_x"], float)) else example["Author_y"]

df = df.assign(Date=df.apply(set_Date, axis=1))
df = df.assign(Author=df.apply(set_Author, axis=1))
df = df.assign(Label=df.apply(set_Label, axis=1))
df = df[["Text", "Author", "Date", "Label"]]
df.head()

Unnamed: 0,Text,Author,Date,Label
0,SAN FRANCISCO/NEW YORK (Reuters) - Wall Stree...,"[Paul Thomasch, Eric Auchard]",2006-10-20 04:25:00,-6.256256
1,FRANKFURT (Reuters) - Internet service provid...,[],2006-10-21 02:21:00,-6.256256
2,NEW YORK (Reuters) - U.S. stocks should exten...,"[ers, Chris S]",2006-10-23 05:24:00,-14.304306
3,NEW YORK (Reuters) - U.S. stocks rallied on M...,[Vivianne Rodrigues],2006-10-23 05:37:00,-14.304306
4,LOS ANGELES (Reuters) - Amazon.com on Tuesday...,"[ria Sage, Alex]",2006-10-24 07:39:00,10.390396


## Testing nlp_utilities

### BOW/TFIDF

In [4]:
import nlp_utilities as nlp_u

data_dict = dict_u.get_dict(dict_dir)
df = data_dict['google']

bow, countvect, feat2word = nlp_u.df_to_bow(df, TFIDF=True)

print("Document - words matrix:", bow.shape)
print("First words:", countvect.get_feature_names()[0:100])

Document - words matrix: (3608, 9966)
First words: ['aa', 'abandon', 'abandoned', 'abb', 'abdominal', 'abide', 'abiding', 'ability', 'able', 'aboard', 'aborted', 'abortive', 'abound', 'abrasive', 'abroad', 'abrupt', 'abruptly', 'absence', 'absent', 'absolute', 'absolutely', 'absorb', 'absorbed', 'absorbing', 'absorption', 'abstain', 'absurd', 'abu', 'abundance', 'abundant', 'abuse', 'abusive', 'abuzz', 'abyss', 'academic', 'academy', 'accelerate', 'accelerated', 'acceleration', 'accelerator', 'accelerometer', 'accent', 'accept', 'acceptable', 'acceptance', 'accepted', 'access', 'accessible', 'accessory', 'accident', 'accidental', 'accidentally', 'acclaim', 'accommodate', 'accommodating', 'accommodation', 'accommodative', 'accompany', 'accomplish', 'accomplished', 'accord', 'accordance', 'according', 'accordingly', 'account', 'accountability', 'accountable', 'accountancy', 'accountant', 'accounting', 'accredited', 'accretive', 'accrue', 'accuracy', 'accurate', 'accurately', 'accusation'

In [5]:
bow

<3608x9966 sparse matrix of type '<class 'numpy.float64'>'
	with 575506 stored elements in Compressed Sparse Row format>

### Word2Vec

In [28]:
model = nlp_u.df_to_vec(df)
model.wv.most_similar(positive="high")

[('low', 0.7797523140907288),
 ('hit', 0.7690551280975342),
 ('record', 0.654312789440155),
 ('highest', 0.6522954106330872),
 ('galloping', 0.6289225220680237),
 ('bullion', 0.6248561143875122),
 ('mid', 0.615885317325592),
 ('gold', 0.604141116142273),
 ('heavy', 0.6038429737091064),
 ('level', 0.603442907333374)]

### Topic models : LDA and NMF

In [45]:
import nlp_utilities as nlp_u

#Fresh start
search_words = ['Google', 'Exxon']
news_to_read = "Reuters"
format_cols = ["Text", "Author", "Date"]
dict_u.reset_dict(dict_dir)
data_u.add_news_to_dict(search_words, data_folder, news_to_read, dict_dir, format_cols)
data_dict = dict_u.get_dict(dict_dir)

df = data_dict['google']
n_words = 20

#LDA
print("Fitting LDA model (tf features)")
X, lda, countvect, feat2word = nlp_u.df_to_lda(df, n_topics = 5, TF = True)
feature_names = countvect.get_feature_names()

print("Topics in tf-LDA model:")
nlp_u.print_topics(lda, feature_names, n_words)

print("\nFitting LDA model (BOW features)")
X, lda, countvect, feat2word = nlp_u.df_to_lda(df, n_topics = 5, TF = False)
feature_names = countvect.get_feature_names()

print("Topics in BOW-LDA model:")
nlp_u.print_topics(lda, feature_names, n_words)

#NMF
print("\nFitting NMF model")
nmf, countvect, feat2word = nlp_u.df_to_nmf(df, n_topics = 5)
feature_names = countvect.get_feature_names()

print("Topics in NMF model:")
nlp_u.print_topics(nmf, feature_names, n_words)


Fitting LDA model (tf features)
Topics in tf-LDA model:
Topic #0: percent said new stock year york company dow rose market index content billion time yahoo high medium higher video average
Topic #1: said video web new site year also medium clip journal york yahoo advertising deal time music service company content last
Topic #2: said percent company new apple year billion yahoo market million would stock quarter also search business time last could one
Topic #3: percent billion said stock web company cash new year fell market would index growth week also york average last high
Topic #4: said percent new yahoo year web company stock billion quarter search million business advertising mobile video york also share system

Fitting LDA model (BOW features)
Topics in BOW-LDA model:
Topic #0: noble energy nook wind solar grid power electric duke smart gas toy oil book bookstore electricity renewable gold project lan
Topic #1: said court patent case commission government information federal la

# Start learning

## Get features

In [4]:
import nlp_utilities as nlp_u
import dict_utilities as dict_u
import data_utilities as data_u

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers available : {data_dict.keys()}")
print(f"Number of news labelled for Google : {len(data_dict['google'])}")

Tickers available : dict_keys([])


KeyError: 'google'

In [50]:
data_dict["google"].tail(5)

Unnamed: 0,Text,Author_x,Date_x,Label_x,Author_y,Date_y,Label_y
11190,By Amy Thomson and Matthew Campbell Nov. 25 (...,,NaT,,[A m y T h o m s o n a n d M a t t h e w...,2013-11-25 00:01:00,-0.780701
11191,"Softbank Corp. (9984) , the Japanesemajority o...",,NaT,,[E h r e n G o o s s e n s],2013-11-25 18:27:48,-0.780701
11192,Working in secret like the programsthey’re rev...,,NaT,,[M a r g a r e t T a l e v],2013-11-25 18:18:51,-0.780701
11193,People who sign up as drivers forUber Technolo...,,NaT,,[M a r k M i l i a n],2013-11-25 05:00:00,-0.780701
11194,"Yahoo! Inc. ’s push to hire Katie Couric , wh...",,NaT,,[r i a n W o m a c k a n d D a v i d H...,2013-11-25 17:26:38,-0.780701


### W2V features

In [6]:
%%time
#df = data_dict["google"]
X_w2v = nlp_u.get_w2v_features(df, stop_words = None, language = 'en', size=200, window=5, min_count=1)

Wall time: 1min


### TFIDF features

In [3]:
%%time
n_samples = len(data_dict["google"])
#df = data_dict["google"].iloc[0:n_samples]
X_bow = nlp_u.get_bow_features(df, stop_words = None, language = 'en', TFIDF = True)

Wall time: 56.5 s


In [9]:
X, countvect, feat2word =nlp_u.df_to_bow(df, stop_words = None, language = 'en', TFIDF = True)

In [10]:
for feat in feat2word:
    print(feat2word[feat])

san
new
york
wall
street
outdo
one
another
raising
stock
look
extreme
quarterly
revenue
rising
percent
two
faster
growth
like
yahoo
company
grip
web
search
market
closed
saying
could
hit
raised
price
target
broker
earnings
estimate
share
included
prudential
think
get
soon
analyst
ben
said
may
even
trade
year
caution
rush
justify
higher
bullish
set
plummet
low
march
certainly
lot
closer
real
told
never
thought
totally
question
matter
time
aggressive
ridicule
saw
era
dot
excess
late
hot
friend
kiss
death
ad
general
exodus
fund
sector
much
greater
remain
standout
bypass
struggling
favor
end
early
come
back
say
strong
heady
profit
despite
heavy
capital
equipment
making
text
poised
variety
video
view
business
good
major
understatement
reiterated
month
ahead
consensus
among
financial
following
third
report
institutional
tend
ignore
attention
medium
retail
reality
paying
sophisticated
investor
sure
would
see
go
embolden
equity
strategist
miller
agreed
nobody
eye
additional
sander
service
prov

wholesale
condition
insufficient
undermine
antitrust
nevertheless
unable
obtain
congress
pending
wide
strenuously
restriction
neutrality
limitation
demanding
neutral
cooper
stake
reportedly
atrial
banc
bear
happening
read
nonfarm
payroll
polled
extending
buoy
bearish
cisco
check
worse
club
sending
surging
spinning
ownership
bout
debut
resignation
embattled
depot
felt
either
direction
man
returned
mourning
northeast
sliding
sparked
weigh
transportation
lagged
badly
spencer
abruptly
mutual
package
piper
loser
install
cell
la
arch
expanded
rim
easy
photo
resigned
gone
filing
chair
size
loan
governance
excite
chorus
kodak
guide
wish
father
sensation
technologist
inappropriate
equally
stated
handset
immediate
surf
hong
opera
internationally
inched
quickly
keynote
centered
needle
lukewarm
crowd
demonstration
wallpaper
buzz
declared
watershed
bach
fair
finally
linchpin
awave
graphic
manage
room
feedback
visually
peer
uphill
technician
momentum
seeing
reinvestment
mounted
snapped
lagging
upbea

apologize
false
fired
apology
afterward
unlock
dissatisfaction
collins
brotherhood
proposal
overseer
josh
agent
conditioned
complain
calculation
establish
overwhelmingly
prosecute
debating
dressed
uniform
fervor
monastic
error
truth
contradiction
jokingly
variant
freely
sown
dissension
enemy
punching
bag
openly
refer
peripheral
pleasure
bloody
annoying
eighty
admit
umbrella
unite
sun
suggest
clearance
yang
bowing
drastic
nonexecutive
rebuild
criticism
coach
navigational
caveat
pair
contain
electrical
legendary
dropout
garage
alto
inspired
native
execute
reinvigorate
abandoned
excitement
helm
extensive
weathered
wrestling
surrounding
acquirer
modify
function
mechanism
compliance
alternate
stem
enforced
baseless
cellular
chieftain
battery
wi
fi
nelson
bridge
exploratory
motivate
realm
grand
vincent
confirming
borne
frustration
landing
airport
realizing
ring
konstantin
uninterrupted
unified
ragged
mane
wiry
frame
bass
duff
musician
velvet
revolver
financially
contraband
equivalent
scene
h

dumb
wither
palatable
pole
shocking
screaming
distressed
captivate
legitimate
bow
overlap
foolish
fray
reiter
deliberate
divest
taint
offing
reasoned
purple
clad
dress
khaki
pant
icon
workplace
bury
animosity
neighbor
brass
arduous
charm
offensive
rarely
irreverence
yodel
engineer
maneuver
bureaucracy
dice
seemingly
silenced
fretted
summed
cowboy
comparatively
stodgy
behemoth
crisp
sunny
milled
grassy
plaza
octopus
applaud
celebrate
achievement
singing
lately
mess
morale
communicating
spirited
entrepreneurial
imbue
amazing
groom
unlocked
calculated
proposition
rebuff
procession
beside
playbook
economically
wren
bowel
brick
scan
archive
glean
painstaking
flaw
cliff
evangelism
headache
universally
semantic
librarian
flotsam
jetsam
latecomer
scanning
dickens
bleak
casting
useless
poison
brain
mixture
formidable
dilute
absorption
hangout
premiere
underway
thwart
untraded
wedded
resurrection
unwieldy
resign
exclusively
shadow
impending
discovered
unanswered
dent
reinvent
duration
toa
heeled

illustration
switched
anyway
shielding
doll
shelter
limbo
chin
resoundingly
catastrophic
characteristic
glory
comeback
eagle
tune
stung
dumping
shied
unstuck
erroneous
surveillance
civic
avid
singled
gratified
mysterious
litigator
racer
tonight
striving
mammoth
wellington
upheaval
seventh
cherry
thread
unload
extract
enact
chaotic
plate
panic
gilt
iceland
grinding
hardly
subside
aversion
firepower
lien
recapitalize
wounded
anecdotal
mobilize
underwrite
tumult
astonishing
freezing
protectionism
rational
allocation
irrationality
rationality
counteract
comprise
cheer
sporting
judiciously
withstood
seesaw
whisper
baked
quantify
stabilization
haul
posse
baxter
gild
mon
frozen
steam
batch
tempt
thaw
interbank
hallmark
sprinkling
unsure
drunken
aloud
supercharged
geneva
withering
greedy
noticeable
nitty
gritty
clawed
unstable
recessionary
mug
scent
accumulate
overshot
pall
brace
trundling
revitalize
puppet
celebration
ga
prudent
brave
dissipated
aborted
progressively
stampede
weighted
peso
zl

guesthouse
furor
opaque
observing
calmly
rationally
immature
utterly
taboo
consular
seldom
limitless
capitulate
liberalize
clover
tripling
petition
aero
politicize
gesture
unfettered
punish
scissors
hazardous
walsh
courtroom
contending
reroute
tea
leeway
judicial
bewildering
touchy
religion
cursory
gong
cult
weapon
damp
dissent
disabled
corrupt
harmonious
inclination
educated
gaggle
fei
nationalist
salute
provenance
confuse
wrongful
forbid
informative
counterfeit
hello
secretive
mandarin
scrapping
forensic
uphold
misbehavior
fortescue
lenient
guinea
citizenship
poem
bombshell
blockage
tribute
teng
inconvenience
blast
consulate
subordinate
obispo
picturesque
suction
possession
unsung
spectacle
cloak
sexy
polytechnic
soul
combed
magic
derby
accurately
divulge
sundry
drill
transistor
pin
millimeter
tricky
magical
melting
dental
plugging
electron
sliced
capacitive
geek
wonderland
prowling
tub
bathroom
piled
clutter
grandfather
manipulator
thirty
bipartisan
stubbornly
loathe
nudge
multilate

sleep
pundit
untouched
wrapper
spec
momentous
neutrino
hitch
ported
bezel
toggle
monolithic
instrumentation
petite
myopic
intolerant
purist
avail
gilded
pedigree
replicated
elicit
variance
intriguing
flawless
dormant
voracious
addicted
refreshing
panorama
chilling
biometric
scanner
trifling
physic
forty
deficient
packet
inter
pager
synch
trapped
tactile
haptics
eponymous
torrid
sturdiness
rage
starry
progression
disclaimer
jasmine
vexed
nebulous
alternating
welcoming
unhelpful
preferential
favoring
abuzz
lent
spontaneously
certification
ware
slicker
bastion
expansive
valentine
ink
dude
nineteen
leniency
confidentiality
airing
void
degradation
dub
cling
furious
prejudicial
unindicted
coaching
reconstruct
rowan
unambiguous
unstoppable
avoidance
unjustified
sally
amelia
complainant
requisite
detriment
walling
racy
risque
excise
mentally
cleanup
jettison
polo
recurrent
inadmissible
calculator
barnacle
tabulated
palace
hush
legged
necked
embarrass
stubborn
retool
checkered
intermix
ostensib

glamorous
inaugural
gradient
competency
ibex
plight
convoluted
soybean
broom
singularly
tirelessly
behaviour
lo
haphazard
debutante
soak
bum
scroll
doctrinally
fingerprint
lockup
overflow
copyist
smudge
orb
tether
wattage
sandpaper
flick
jam
zealously
impartial
laborious
protrude
lozenge
exasperated
retarding
hyperbole
humanity
variability
anachronism
firebird
cone
cockpit
driverless
socialization
centering
fender
curator
dearborn
toaster
acquittal
pointedly
transpire
staging
disenchantment
holdup
exempt
levity
inhibit
fuss
preaching
ominous
detached
polar
capstone
alchemy
exaggerate
respondent
purveyor
rework
bogus
rectangle
granular
blaze
nimbly
skirted
stylus
geometric
annex
hangar
rugged
tram
slater
swanky
creature
opulent
erecting
spaceship
iconoclastic
dissolve
dame
overrule
billable
comprehensible
preliminarily
upswing
hertz
truce
digestion
spiraled
surroundings
concierge
errant
mercer
tire
brawl
treatise
cashier
detach
recharge
upstate
benign
removable
punchy
thickness
sharpnes

extremist
seething
absurdity
inone
mosque
possessed
immeasurably
paranormal
ast
sdeath
merlin
popcorn
catamaran
resin
barlow
prescriptive
podium
leadin
toru
nudity
homosexual
rollback
laud
specie
defective
concurrently
doorman
synthesis
relish
akindle
unabashedly
photovoltaic
quaker
twant
electro
barish
upwardly
blackjack
texture
rector
bulldozer
bolivia
windowless
spawn
craziness
wildlife
diameter
asocial
benny
parkin
tunneling
diagonal
permissive
yearlong
blasphemy
authentic
kemp
otto
shading
masterpiece
reunion
appointee
bayou
radiation
ordinance
supervisor
binh
shipbuilding
shipyard
acne
colon
postponement
cough
respiratory
mattress
derogatory
tania
innocuous
herding
holographic
locale
pur
adolescent
maternal
akey
childbirth
rag
scavenge
symbolize
unblemished
schwarz
leaky
atechnical
areal
shakedown
jar
augur
woeful
navigable
waxman
welding
tinkerer
ashamed
hydrochloride
milligram
antibody
reciprocal
rot
primer
nongovernmental
nontraditional
delinquency
hazard
unpersuasive
imaginar

nitrogen
coronary
lori
rouge
martial
parental
tribble
despicable
garment
ling
postwar
clandestine
irregular
illustrious
hooker
calorie
partridge
durst
citrus
shearing
tarmac
scratcher
dinky
brighten
ma
postage
servant
tariff
agraphic
sud
obedient
deplorable
horrific
kosher
symptom
sleazy
extraterritorial
macao
incompetent
brock
silverleaf
descriptive
tutor
unify
relaxant
compressor
ovarian
filler
hove
retrospective
maddening
blasphemous
soiled
forlorn
walkway
sapphire
gent
chateau
palisade
oneanother
bogart
dempster
arraignment
grading
piggy
fin
allergic
glaucoma
hyperactivity
weaver
exam
tomato
willie
goodyear
moro
psychiatric
kensington
dicker
bombardment
reiterate
anarchy
jamboree
upstage
juxtaposition
arcadian
classicist
cabaret
devotion
aha
outlaw
mast
allergy
trusting
flier
unforeseen
satirist
jukebox
aunt
prolifically
hardback
handmaiden
scrivener
poe
scold
decency
intuition
offended
minutia
plex
plenary
blossom
voodoo
doughnut
misstatement
dint
unneeded
cag
redeemable
insectici

laminated
slapping
counteroffer
holm
ratification
nonvoting
realtor
grange
haveless
feller
wingman
rabid
extortion
lehrman
infantry
battalion
appraisal
reauthorization
chestnut
motorist
presidentially
mollie
skeletal
hapless
extremity
unshakable
witch
vendetta
sunglass
scenery
os
thoroughness
encyclopedic
extol
perseverance
iridium
perspiration
sweating
outsmart
prosaic
watchful
overstate
investigatory
nervously
busted
tic
rocher
matriarch
knockout
gargoyle
mediating
muffin
adda
statehood
purity
recreational
telescopic
refuel
crust
asteroid
colonize
steelmaking
featherweight
gander
verisimilitude
glistening
tanning
symbolism
pious
abolish
empt
thunderbird
brunch
rosario
dinette
bovine
spongiform
encephalopathy
milking
ruck
narration
undead
reclassification
scream
dachshund
anguished
autobiographical
reaffirm
burial
inartificial
kale
entrenchment
joinder
incrimination
kinsman
gnash
hothouse
cricketer
leech
undeniably
fleshed
pawn
ebony
starch
reeder
hustle
truffle
fennel
danner
gruesome

### LDA features

In [8]:
%%time
n_samples = len(data_dict["google"])
#df = data_dict["google"].iloc[0:n_samples]
X_lda = nlp_u.get_lda_features(df, n_topics = 5, stop_words = None, language = 'en', TF = True)

Wall time: 43.4 s


### NMF features

In [9]:
%%time
n_samples = len(data_dict["google"])
#df = data_dict["google"].iloc[0:n_samples]
X_nmf = nlp_u.get_nmf_features(df, n_topics=5, stop_words = None, language = 'en')

Wall time: 34.7 s


## Regression

In [13]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
#Add Voting regression when all have been tested ?

#The classifiers we will compare
clfs = { "Random Forests" : RandomForestRegressor(n_estimators=100, criterion="mse"),
         "Gradient Boosting" : GradientBoostingRegressor(n_estimators=100),
         "Decision Tree" : tree.DecisionTreeRegressor(),
         "SVR" : svm.SVR(kernel='rbf', C = 1),
         "Gaussian Process" : GaussianProcessRegressor(n_restarts_optimizer = 3),
         "Adaboost" : AdaBoostRegressor(tree.DecisionTreeRegressor(criterion='mse', max_depth = 3), n_estimators=100)}

#The models we will compare
models = { "LDA" : X_lda,
           "NMF" : X_nmf,
           "Word2Vec" : X_w2v,
           "TFIDF" : X_bow }

y = df["Label"]
results = {}
for model_name in models:
    for clf_name in clfs:
        X = models[model_name]
        clf = clfs[clf_name]
        scores = cross_val_score(clf, X, y)
        mean_score = np.mean(scores)
        print(model_name + " + " + clf_name + f" : {mean_score}")
        results[model_name + " + " + clf_name] = mean_score
        


LDA + Random Forests : -0.5285720766104164
LDA + Gradient Boosting : -0.012408617924239395
LDA + Decision Tree : -1.0945644862030006
LDA + SVR : -0.00034801313748857867
LDA + Gaussian Process : -0.0005595241881841906
LDA + Adaboost : -0.017410092752664495
NMF + Random Forests : -0.13634837727362328
NMF + Gradient Boosting : -0.02865087404502762
NMF + Decision Tree : -1.2420930818531388
NMF + SVR : -0.0027650161983818313
NMF + Gaussian Process : -0.009406285996548202
NMF + Adaboost : -0.08115315264524403
Word2Vec + Random Forests : -0.04796248183035905
Word2Vec + Gradient Boosting : -0.03618469700840725
Word2Vec + Decision Tree : -1.5694811844978613
Word2Vec + SVR : -0.005409387346561267
Word2Vec + Gaussian Process : -0.0003396798874599316
Word2Vec + Adaboost : -0.17299781633262587


KeyboardInterrupt: 

In [12]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

#The classifiers we will compare
clfs = { "Random Forests" : RandomForestRegressor(n_estimators=100, criterion="mse"),
         "Gradient Boosting" : GradientBoostingRegressor(n_estimators=100),
         "Decision Tree" : tree.DecisionTreeRegressor(),
         "SVR" : svm.SVR(kernel='rbf', C = 1),
         "Gaussian Process" : GaussianProcessRegressor(n_restarts_optimizer = 3),
         "Adaboost" : AdaBoostRegressor(tree.DecisionTreeRegressor(criterion='mse', max_depth = 3), n_estimators=100)}

#The models we will compare
models = { "TFIDF" : X_bow,
           "Word2Vec" : X_w2v,
           "LDA" : X_lda,
           "NMF" : X_nmf}

y = df["Label"]
results = {}
X = models[model_name]
clf = clfs[clf_name]
scores = cross_val_score(clf, X, y)
mean_score = np.mean(scores)
print(model_name + " + " + clf_name + f" : {mean_score}")
results[model_name + " + " + clf_name] = mean_score
        


(11195, 5)