# Here is a model for NLP that allows us to clean data and correctly categorize each post from reddit as it was classified.  Step by step the model will:
1) clean and process data
2) vectorize data
3) fit a model with spacy
4) score the model with a confusion matrix
5) test the model
6) pickle the model

In [15]:
# Imports
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import pandas as pd
import string
import re
import nltk

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tcnick12\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Load in data, mined by Jonathan
df = pd.read_csv('reddit_data_slimmed.csv')

In [68]:
df.head()

Unnamed: 0.1,Unnamed: 0,content,subreddit
0,38,Can we please boycott Star Wars battlefront 2 ...,gaming
1,39,Join the Battle for Net Neutrality! Net Neutra...,gaming
2,40,EA deleted my Origin account and EA help is to...,gaming
3,41,R.I.P TotalBiscuit https://twitter.com/GennaBa...,gaming
4,42,Leaked Harry Potter game in development by Roc...,gaming


In [69]:
# Drop the unnamed columns
df = df.drop(['Unnamed: 0'], axis=1)

In [72]:
df.head()

Unnamed: 0,content,subreddit
0,Can we please boycott Star Wars battlefront 2 ...,gaming
1,Join the Battle for Net Neutrality! Net Neutra...,gaming
2,EA deleted my Origin account and EA help is to...,gaming
3,R.I.P TotalBiscuit https://twitter.com/GennaBa...,gaming
4,Leaked Harry Potter game in development by Roc...,gaming


In [77]:
# Process data with this function
def cleaning_fn(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    4. Returns in lowercase.
    """
    # Check characters to see if they are in punctuation
    clean = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    clean = ''.join(clean)
    
    clean = clean.lower()
    
    # Now just remove any stopwords
    return [word for word in clean.split() if word.lower() not in stopwords.words('english')]

In [75]:
# Apply the function
df['clean_text'] = df['content'].apply(cleaning_fn)

Exception ignored in: <function SeekableUnicodeStreamReader.__del__ at 0x000002AE7475F798>
Traceback (most recent call last):
  File "C:\Users\tcnick12\Anaconda3\lib\site-packages\nltk\data.py", line 1281, in __del__
    self.close()
  File "C:\Users\tcnick12\Anaconda3\lib\site-packages\nltk\data.py", line 1310, in close
    self.stream.close()
KeyboardInterrupt


KeyboardInterrupt: 

In [76]:
df['content'].head()

0    Can we please boycott Star Wars battlefront 2 ...
1    Join the Battle for Net Neutrality! Net Neutra...
2    EA deleted my Origin account and EA help is to...
3    R.I.P TotalBiscuit https://twitter.com/GennaBa...
4    Leaked Harry Potter game in development by Roc...
Name: content, dtype: object

In [78]:
# Randomize the rows of the df so we don't have the iloc 1-100 all classified
# as one class, the next 200 as another, etc., so we don't have issues with
# a train/test split
df = df.sample(frac=1).reset_index(drop=True)

In [79]:
# Show the df['subreddit'] is no longer grouped by class but it sorted at random.
df.head()

Unnamed: 0,content,subreddit
0,What are some ways you made money while travel...,travel
1,Do Tectonic plates ever change in size and or ...,askscience
2,My Life Changing 4 Month Transformation / The ...,Fitness
3,My dad was bragging about his new hearing aid....,dadjokes
4,My mom sent me some old home videos for my bir...,nosleep


In [None]:
# Create a bag of words transformer using the 'cleaning_fn' from above
bow_transformer = CountVectorizer(analyzer=cleaning_fn).fit(df['content'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

In [43]:
# take one review and get its bag-of-words counts as a vector
review4 = df['clean_text'][3]
print(review4)

['went', 'night', 'closed', 'part', 'street', 'party', 'best', 'part', 'actually', 'group', 'guys', 'paid', 'table', 'another', 'group', 'people', 'convinced', 'take', 'pictures', 'facebook', 'place', 'lame']


In [44]:
# take one review and get its bag-of-words counts as a vector

bow4 = bow_transformer.transform(review4)
print(bow4)
print(bow4.shape)

  (0, 31277)	1
  (1, 19345)	1
  (2, 6409)	1
  (3, 20794)	1
  (4, 27404)	1
  (5, 20824)	1
  (6, 3727)	1
  (7, 20794)	1
  (8, 1474)	1
  (9, 13059)	1
  (10, 13197)	1
  (11, 20566)	1
  (12, 28058)	1
  (13, 2249)	1
  (14, 13059)	1
  (15, 21084)	1
  (16, 7174)	1
  (17, 28138)	1
  (18, 21386)	1
  (19, 10744)	1
  (20, 21566)	1
  (21, 16246)	1
(22, 32193)


In [45]:
# Check the words used in this example from review4.  This prints the 2nd and 3rd word used in that review.

print(bow_transformer.get_feature_names()[19345])
print(bow_transformer.get_feature_names()[6409])

night
closed


In [46]:
# Now transform the entire df
messages_bow = bow_transformer.transform(df['text'])

In [47]:
# Print shape of sparse mtx
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Shape of Sparse Matrix:  (10000, 32193)
Amount of Non-Zero occurences:  487809


In [48]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 31277)	1.0
  (1, 19345)	1.0
  (2, 6409)	1.0
  (3, 20794)	1.0
  (4, 27404)	1.0
  (5, 20824)	1.0
  (6, 3727)	1.0
  (7, 20794)	1.0
  (8, 1474)	1.0
  (9, 13059)	1.0
  (10, 13197)	1.0
  (11, 20566)	1.0
  (12, 28058)	1.0
  (13, 2249)	1.0
  (14, 13059)	1.0
  (15, 21084)	1.0
  (16, 7174)	1.0
  (17, 28138)	1.0
  (18, 21386)	1.0
  (19, 10744)	1.0
  (20, 21566)	1.0
  (21, 16246)	1.0


In [50]:
reviews_tfidf = tfidf_transformer.transform(messages_bow)

In [51]:
from sklearn.naive_bayes import MultinomialNB
review_model = MultinomialNB().fit(reviews_tfidf, df['stars'])

In [58]:
print('predicted:', review_model.predict(tfidf4)[0])
print('expected:', df.stars[2])

predicted: 5
expected: 3


In [57]:
all_predictions = review_model.predict(messages_tfidf)
print(all_predictions)

[1 5 5 ... 5 5 1]


In [59]:
from sklearn.metrics import classification_report
print (classification_report(df['stars'], all_predictions))

              precision    recall  f1-score   support

           1       0.96      0.22      0.36      1496
           2       0.00      0.00      0.00       759
           3       1.00      0.00      0.00      1098
           4       0.89      0.03      0.06      2185
           5       0.47      1.00      0.64      4462

    accuracy                           0.49     10000
   macro avg       0.66      0.25      0.21     10000
weighted avg       0.66      0.49      0.35     10000



  'precision', 'predicted', average, warn_for)


In [82]:
from sklearn.model_selection import train_test_split

X = df['content']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(25332,)
(6334,)
(25332,)
(6334,)


In [83]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=cleaning_fn)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
predictions = pipeline.predict(X_test)

In [None]:
print(classification_report(predictions,y_test))

# To Do:
Include and option where we can suggest the top three subreddits. Pickle the model.