In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../raw_data/tweets_cleaned',names=['tweets','label'],skiprows=1)
df.tail()

Unnamed: 0,tweets,label
4621,lose anger anxieti insecur depress medit buddh...,1
4622,anisalukman baik aja mani secret amp lie kalo ...,0
4623,pan ghey friend lean leftist side polit medic ...,1
4624,amarieb eeee nacho glasvega bad eh,0
4625,think go hit amp meat groceri store crazi hang...,0


In [3]:
df.shape

(4626, 2)

In [4]:
df['label'].value_counts()

1    2313
0    2313
Name: label, dtype: int64

In [5]:
df = df.dropna()

# Model

In [6]:
X = df['tweets']
y = df['label']

In [7]:
[X.values]

[array(['stormeranna life gener depress anxieti futur',
        'kellyclarkson amaz much depress parent divorc music help lot music empow mayb write one inner struggl abl dri tear find someon chang life need like',
        'sooooooo happppi pupppyy offici potti train also mom bought appl jack last night soo start day right',
        ...,
        'pan ghey friend lean leftist side polit medic depress showin face without makeup yall think itll',
        'amarieb eeee nacho glasvega bad eh',
        'think go hit amp meat groceri store crazi hangov achiev'],
       dtype=object)]

In [8]:
X.values[3800]

'fri happi food tweet sg'

In [9]:
X.isnull().sum()

0

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df = 0.8, ngram_range=(1, 2))),
    ('nb', MultinomialNB(alpha=1.0)),
])

pipeline

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.8, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [11]:
X

0            stormeranna life gener depress anxieti futur
1       kellyclarkson amaz much depress parent divorc ...
2       sooooooo happppi pupppyy offici potti train al...
3       today girl two month day feel like way fuckin ...
4       one mini poem helenswritingmentalhealthdepress...
                              ...                        
4621    lose anger anxieti insecur depress medit buddh...
4622    anisalukman baik aja mani secret amp lie kalo ...
4623    pan ghey friend lean leftist side polit medic ...
4624                   amarieb eeee nacho glasvega bad eh
4625    think go hit amp meat groceri store crazi hang...
Name: tweets, Length: 4618, dtype: object

In [12]:
cross_val_score(estimator=pipeline,
               X=X,
               y=y,
               scoring='f1',
               cv=5,
               n_jobs=-1).mean()

0.9167059795764129

## Model Parameters

In [13]:
tfid3 = TfidfVectorizer(ngram_range=(1,2))
nb3 = MultinomialNB(alpha = 1.0)

In [14]:
tfid3.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [15]:
vector3 = tfid3.transform(X)

In [16]:
tfid3.get_feature_names()

['aa',
 'aa poc',
 'aaa',
 'aaa shooter',
 'aaaa',
 'aaaa realli',
 'aaaaaaaand',
 'aaaaaaaand back',
 'aaaaaah',
 'aaaaaah officialquest',
 'aaaaaand',
 'aaaaaand depress',
 'aaaaargh',
 'aaaaargh also',
 'aaaaayyyyy',
 'aaaaayyyyy momma',
 'aaaah',
 'aaaah babi',
 'aaaawwwwww',
 'aaaawwwwww love',
 'aaah',
 'aaah see',
 'aaahhhh',
 'aaahhhh hypocrisi',
 'aabc',
 'aaca',
 'aacebad',
 'aaron',
 'aaron dor',
 'aaron right',
 'aawjti',
 'aawjti ocid',
 'ab',
 'ab challeng',
 'ab gp',
 'ab may',
 'ab month',
 'abandon',
 'abandon heartbroken',
 'abat',
 'abat depress',
 'abatevintag',
 'abatevintag fleurchipbowl',
 'abatevintag list',
 'abbey',
 'abbey church',
 'abbey year',
 'abbi',
 'abbi bestieessss',
 'abbrevi',
 'abbrevi sr',
 'abbuffata',
 'abbuffata cab',
 'abcacffcaeecd',
 'abdul',
 'abdul coolin',
 'abdulla',
 'abdulla shaikh',
 'abe',
 'abe lincoln',
 'abeautyhealthi',
 'abeautyhealthi www',
 'abel',
 'abel love',
 'abil',
 'abil accessal',
 'abil manag',
 'abil think',
 'abl',

In [17]:
nb3.fit(vector3, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
nb3.coef_

array([[-10.83882374, -10.83237508, -10.8227035 , ..., -10.86929887,
        -10.79040235, -10.79040235]])

In [19]:
coefs = pd.Series(nb3.coef_[0], index = tfid3.get_feature_names())

In [20]:
coefs.sort_values()

lol know        -10.981609
phlippin        -10.981609
phlippin love   -10.981609
flavor quot     -10.981609
flavor          -10.981609
                   ...    
face             -7.848994
pic              -7.618340
emoji            -7.511419
anxieti          -7.323297
depress          -6.074262
Length: 47971, dtype: float64

# Train the model including Reddit posts

## Import data

In [28]:
df = pd.read_csv('../raw_data/twitter_reddit_text.csv',names=['text','label'],skiprows=1)
df.isnull().sum()

text     10
label     0
dtype: int64

In [29]:
df = df.dropna()

## Holdout Method

In [35]:
from sklearn.model_selection import train_test_split

X = df['text'].values
y= df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [36]:
X_train

array(['swagbuck woohoo thank swagbuck',
       'go allow depress take life mental health go downhil go let',
       'go bed earli go burbank need regist extra cast central', ...,
       'yey thank much support',
       'know young hell virtual still kid mean like spend lot time think kid watch pokmon watch youtub like go outsid friend focus creativ littl thing know get extrem upset miss day never respons never worri whether next day go okay never deep root fear get old be grumpi old man peopl would want around use wake earli watch cartoon excit whole day ahead like sleep long even tire take away realiti situat use excit birthday christma halloween holiday remind slowli slowli get old one year closer end use super excit around peopl got littl shi sometim fear talk peopl mess can not fix anyth hate look piti thought go mind wall lack trust actual fear peopl use passion aliv spend day room everi singl god damn second puter forgot eat dinner today forgot someon remind ate appl use love lo

## Model

In [37]:
cross_val_score(estimator=pipeline,
               X=X_train,
               y=y_train,
               scoring='f1',
               cv=5,
               n_jobs=-1).mean()

0.8755270587177378

## Grid Search

In [38]:
from sklearn.model_selection import GridSearchCV

pipeline.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=1.0, max_features=None,
                   min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=None,
                 min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words=None,

In [44]:
params = {
    'tfidf__max_df': [0.7,0.8,0.9,1.0],
    'tfidf__max_features': [None,10,100,1000],
    'tfidf__ngram_range': [(1,1),(1,2),(2,2)]
}

search = GridSearchCV(estimator=pipeline,
                     n_jobs=-1,
                     scoring='f1',
                     param_grid=params)

search.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [45]:
search.best_params_

{'tfidf__max_df': 0.7,
 'tfidf__max_features': 1000,
 'tfidf__ngram_range': (1, 2)}

In [46]:
search.best_score_

0.919430037831987

In [48]:
best_model = search.best_estimator_

In [49]:
y_pred = best_model.predict(X_test)

In [52]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

f1_score(y_test,y_pred)

0.9193548387096773

In [53]:
roc_auc_score(y_test,y_pred)

0.9143029636577293

## Model parameters

In [56]:
tfid3 = TfidfVectorizer(ngram_range=(1,2), max_features=1000, max_df=0.7)
nb3 = MultinomialNB()

tfid3.fit(X_train)

vector3 = tfid3.transform(X_train)

nb3.fit(vector3, y_train)

coefs = pd.Series(nb3.coef_[0], index = tfid3.get_feature_names())

coefs.sort_values(ascending=False).head(30)

depress     -4.227890
feel        -4.283009
like        -4.527188
want        -4.635753
life        -4.841153
get         -4.855391
know        -4.912859
go          -4.926771
can         -5.040481
not         -5.041957
can not     -5.042328
time        -5.133754
peopl       -5.140968
even        -5.145155
feel like   -5.145583
thing       -5.225587
day         -5.235261
realli      -5.243224
think       -5.255516
would       -5.262516
help        -5.271047
tri         -5.274033
friend      -5.283072
year        -5.295550
make        -5.310442
one         -5.315518
fuck        -5.398387
work        -5.416942
never       -5.506018
much        -5.539064
dtype: float64