In [None]:
# Execute the helper notebooks in this kernel so the names they define are usable below.
# NOTE(review): get_tweets_and_reddit and clean_text are not defined in this notebook;
# presumably they come from these two notebooks — confirm.
%run data.ipynb
%run Clean_Tweets_preprocessing.ipynb

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [None]:
# Build the working frame. get_tweets_and_reddit is defined outside this notebook
# (presumably by the notebooks executed with %run at the top — confirm).
data = get_tweets_and_reddit()

# Overwrite the 'text' column with whatever clean_text returns for it.
# NOTE(review): clean_text is an external helper; its exact transformations are not visible here.
data.loc[:, 'text'] = clean_text(data.loc[:, 'text'])

In [None]:
# Persist the frame (with the cleaned 'text' column) to CSV.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
# NOTE(review): the path is relative to the working directory and points at raw_data
# even though the text has already been cleaned — confirm this is the intended location.
data.to_csv('../../raw_data/twitter_reddit_clean.csv')

In [None]:
# Number of rows per distinct value of the 'label' column (quick class-balance check).
data['label'].value_counts()

# Holdout method

In [None]:
from sklearn.model_selection import train_test_split

# Features are the text documents, targets are their labels (both as NumPy arrays).
X = data['text'].values
y = data['label'].values

# Hold out 30% of the rows for the final evaluation.
# - random_state pins the shuffle so a Restart & Run All reproduces the same split
#   (and therefore the same scores in every cell below).
# - stratify=y keeps the label proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Model baseline

In [None]:
# Baseline model: TF-IDF vectorizer followed by a multinomial Naive Bayes classifier,
# both with their default settings.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Average F1 over 5 cross-validation folds, computed on the training part only.
cross_val_score(pipeline, X_train, y_train, scoring='f1', cv=5, n_jobs=-1).mean()

# Tuning the Model

In [None]:
# Show every parameter of the pipeline and of its steps; the nested '<step>__<param>'
# names listed here are the keys a grid-search parameter grid has to use.
pipeline.get_params()

In [None]:
# Candidate values per pipeline parameter ('<step>__<param>' naming).
# NOTE(review): a float min_df is a document *proportion*, so 0.5 keeps only terms that
# occur in at least half of the documents — this may leave an empty vocabulary and make
# those candidates fail during fitting; confirm the value is intended.
params = dict(
    tfidf__max_df=[1.0, 0.8],
    tfidf__max_features=[None, 100, 1000],
    tfidf__min_df=[1, 0.5],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    nb__alpha=[1],
)

# Exhaustive search over the grid, scored with F1, using all available cores.
search = GridSearchCV(pipeline, params, scoring='f1', n_jobs=-1)

# Fit on the training part only; the test part stays untouched until the final evaluation.
search.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score in the search.
search.best_params_

In [None]:
# Mean cross-validated score (F1, per the search's scoring argument) of the best combination.
search.best_score_

# Evaluate the model on the test set

In [None]:
# Pipeline selected by the grid search.
best_model = search.best_estimator_
# Predicted labels for the held-out documents.
y_pred = best_model.predict(X_test)

# F1 on the held-out test part.
f1_score(y_true=y_test, y_pred=y_pred)

## Classification Report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support for the test-set predictions.
# The report is a plain-text table, so it is printed rather than displayed.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the tuned model on the held-out test part.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2, so
# importing it fails on current versions. Prefer ConfusionMatrixDisplay.from_estimator
# and fall back to the legacy helper only on installs that predate it.
if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test)
else:
    from sklearn.metrics import plot_confusion_matrix

    plot_confusion_matrix(estimator=best_model,
                          X=X_test,
                          y_true=y_test)