In [44]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import regex as re

# Display the max for string
pd.set_option('max_colwidth', 50)

In [2]:
df = pd.read_csv('../dataset/combined.csv')

In [3]:
df.drop_duplicates(subset='title',inplace=True)

In [4]:
df['subreddit'].value_counts()

tea       986
Coffee    927
Name: subreddit, dtype: int64

<img src="http://www.clker.com/cliparts/k/L/Z/0/Q/x/red-check-mark.svg" style="float: left; margin: 0px 0px 0px 0px; height: 20px">

In [5]:
# As many posts have images in their self text, we will include titles in selftext to make up the text column
# Add title to selftext to fill up empty rows
df["title_text"] = df["title"].map(str) + ' ' + df["selftext"].map(str)
# Replacing newlines and /r/ reddit addition to subreddits
df = df.replace('\n','', regex=True)
df = df.replace('/r/','', regex=True)

In [6]:
X = df['title_text'].values.astype('str')
y = df['subreddit']

In [7]:
# Binary classification. Predicting Tea as our target as opposed to coffee
y.replace('Coffee', 0,inplace=True)
y.replace('tea',1,inplace=True)

In [8]:
# Tokenize
token = RegexpTokenizer(r'\w+')
X_token = df['title_text'].apply(token.tokenize)

In [9]:
# Stop words
X_token_stop = X_token.apply(lambda x : [i.lower() for i in x if not i.lower() in stopwords.words('english')])

In [10]:
# Lemmatize
lem = WordNetLemmatizer()
X_token_stop_lem = X_token_stop.apply(lambda x : [lem.lemmatize(str(i)) for i in x])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_token_stop_lem, y, stratify=y, random_state=42)

<img src="http://www.clker.com/cliparts/k/L/Z/0/Q/x/red-check-mark.svg" style="float: left; margin: 0px 0px 0px 0px; height: 20px">

In [23]:
pipe = Pipeline([
    ('cvec', CountVectorizer(lowercase=False,
                             analyzer = "word")),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression())
])

In [43]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'cvec', 'tfidf', 'lr', 'cvec__analyzer', 'cvec__binary', 'cvec__decode_error', 'cvec__dtype', 'cvec__encoding', 'cvec__input', 'cvec__lowercase', 'cvec__max_df', 'cvec__max_features', 'cvec__min_df', 'cvec__ngram_range', 'cvec__preprocessor', 'cvec__stop_words', 'cvec__strip_accents', 'cvec__token_pattern', 'cvec__tokenizer', 'cvec__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'lr__C', 'lr__class_weight', 'lr__dual', 'lr__fit_intercept', 'lr__intercept_scaling', 'lr__l1_ratio', 'lr__max_iter', 'lr__multi_class', 'lr__n_jobs', 'lr__penalty', 'lr__random_state', 'lr__solver', 'lr__tol', 'lr__verbose', 'lr__warm_start'])

In [25]:
pipe_params = {
    'cvec__max_features': [500,1000,1500,2000,2500, 3000, 3500],
    'cvec__min_df': [2, 3, 4],
    'cvec__max_df': [.75, .85, .9, .95],
    'cvec__ngram_range': [(1, 1),(1,2)],
    'lr__l1_ratio': np.linspace(0,1,6),
}

In [26]:
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5, n_jobs=-1,verbose = 1)
gs.fit(X_train.astype(str), y_train)

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 5040 out of 5040 | elapsed:  3.7min finished
  "(penalty={})".format(self.penalty))


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                           

In [27]:
print(gs.best_score_)
gs.best_params_

0.9804741980474198


{'cvec__max_df': 0.75,
 'cvec__max_features': 1000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'lr__l1_ratio': 0.0}

In [28]:
gs.score(X_train.astype(str), y_train)

0.9909344490934449

In [29]:
gs.score(X_test.astype(str), y_test)

0.9812108559498957

In [30]:
y_pred = gs.predict(X_test.astype(str))

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       232
           1       0.99      0.97      0.98       247

    accuracy                           0.98       479
   macro avg       0.98      0.98      0.98       479
weighted avg       0.98      0.98      0.98       479



<img src="http://www.clker.com/cliparts/k/L/Z/0/Q/x/red-check-mark.svg" style="float: left; margin: 0px 0px 0px 0px; height: 20px">

In [51]:
# Other model
pipe2 = Pipeline([
    ('cvec', CountVectorizer(lowercase=False, strip_accents='unicode', analyzer = "word")),
    ('multinb', MultinomialNB())
])

In [52]:
pipe2.get_params()

{'memory': None,
 'steps': [('cvec',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=False, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, vocabulary=None)),
  ('multinb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'verbose': False,
 'cvec': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                 lowercase=False, max_df=1.0, max_features=None, min_df=1,
                 ngram_range=(1, 1), preprocessor=None, stop_words=None,
                 strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, vocabular

In [53]:
pipe2_params = {
    'cvec__max_features': [500,1000,1500,2000,2500, 3000, 3500],
    'cvec__min_df': [2, 3, 4],
    'cvec__max_df': [.75, .85, .9, .95],
    'cvec__ngram_range': [(1, 1),(1,2)]
}

In [54]:
gs2 = GridSearchCV(pipe2, param_grid=pipe2_params, cv=5, n_jobs=-1,verbose = 1)
gs2.fit(X_train.astype(str), y_train)

Fitting 5 folds for each of 168 candidates, totalling 840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 840 out of 840 | elapsed:   37.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                           

In [55]:
print(gs2.best_score_)
gs2.best_params_

0.9811715481171548


{'cvec__max_df': 0.75,
 'cvec__max_features': 2000,
 'cvec__min_df': 4,
 'cvec__ngram_range': (1, 2)}

In [56]:
gs2.score(X_train.astype(str), y_train)

0.9853556485355649

In [57]:
gs2.score(X_test.astype(str), y_test)

0.9749478079331941

In [58]:
y_pred = gs2.predict(X_test.astype(str))

In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97       232
           1       0.98      0.97      0.98       247

    accuracy                           0.97       479
   macro avg       0.97      0.98      0.97       479
weighted avg       0.98      0.97      0.97       479



<img src="http://www.clker.com/cliparts/k/L/Z/0/Q/x/red-check-mark.svg" style="float: left; margin: 0px 0px 0px 0px; height: 20px">

<font color = red> Comments<br>
1. Analyse the results and choose the model.
</font>